fix: remove hardcoded Chinese title heuristics (#887)

* fix: remove hardcoded Chinese title heuristics

* fix: use English placeholder for non-Latin fallback titles
This commit is contained in:
Pavol Biely
2026-04-23 20:45:34 +04:00
committed by GitHub
parent ae7be6deba
commit 96c97c5e0e
3 changed files with 74 additions and 43 deletions

View File

@@ -106,7 +106,7 @@ def _sanitize_generated_title(text: str) -> str:
"""Sanitize LLM-generated title text before persisting to session.""" """Sanitize LLM-generated title text before persisting to session."""
s = _strip_thinking_markup(text or '') s = _strip_thinking_markup(text or '')
s = re.sub( s = re.sub(
r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*[:]\s*(?:[*_`~]+\s*)?', r'^\s*(?:[*_`~]+\s*)?(?:session\s+title|title)\s*:\s*(?:[*_`~]+\s*)?',
'', '',
s, s,
flags=re.IGNORECASE, flags=re.IGNORECASE,
@@ -132,10 +132,7 @@ def _looks_invalid_generated_title(text: str) -> bool:
or re.search(r'^\s*(i|we)\s+(should|need to|will|can)\b', s, flags=re.IGNORECASE) or re.search(r'^\s*(i|we)\s+(should|need to|will|can)\b', s, flags=re.IGNORECASE)
or re.search(r'^\s*let me\b', s, flags=re.IGNORECASE) or re.search(r'^\s*let me\b', s, flags=re.IGNORECASE)
or re.search(r"^\s*here(?:'s| is) (?:a |my )?(?:thinking|thought)", s, flags=re.IGNORECASE) or re.search(r"^\s*here(?:'s| is) (?:a |my )?(?:thinking|thought)", s, flags=re.IGNORECASE)
or re.search(r'用户(要求|希望|想让|让我)', s)
or re.search(r'请只?回复', s)
or re.search(r'^\s*(ok|okay|done|all set|complete|completed|finished)\b[\s.!?]*$', s, flags=re.IGNORECASE) or re.search(r'^\s*(ok|okay|done|all set|complete|completed|finished)\b[\s.!?]*$', s, flags=re.IGNORECASE)
or re.search(r'^\s*(好的|好啦|完成了|已完成|测试完成|测试已完成|可以了|没问题)\s*[!。\.\s]*$', s)
) )
@@ -209,10 +206,10 @@ def _title_prompts(user_text: str, assistant_text: str) -> tuple[str, list[str]]
"Return only the title text, 3-8 words, as a topic label.\n" "Return only the title text, 3-8 words, as a topic label.\n"
"Do not use markdown, bullets, labels, or prefixes like Session Title:.\n" "Do not use markdown, bullets, labels, or prefixes like Session Title:.\n"
"Do not output a full sentence.\n" "Do not output a full sentence.\n"
"Do not output acknowledgements or completion phrases like OK, done, all set, 测试完成.\n" "Do not output acknowledgements or completion phrases like OK, done, or all set.\n"
"Do not describe internal reasoning.\n" "Do not describe internal reasoning.\n"
"Bad: The user is asking..., OK, 好的,测试完成!\n" "Bad: The user is asking..., OK, all set.\n"
"Good: 自动标题生成测试, Clarify Dialog Layout, GitHub Issue Triage" "Good: Title Generation Test, Clarify Dialog Layout, GitHub Issue Triage"
), ),
( (
"Rewrite this conversation start as a concise noun-phrase title.\n" "Rewrite this conversation start as a concise noun-phrase title.\n"
@@ -437,10 +434,10 @@ def _fallback_title_from_exchange(user_text: str, assistant_text: str) -> Option
combined = f"{user_text} {assistant_text}".strip().lower() combined = f"{user_text} {assistant_text}".strip().lower()
combined_raw = f"{user_text} {assistant_text}".strip() combined_raw = f"{user_text} {assistant_text}".strip()
def _contains_latin(text: str) -> bool:
return bool(re.search(r'[A-Za-z]', text or ''))
def _extract_named_topic(text: str) -> str: def _extract_named_topic(text: str) -> str:
m = re.search(r'《([^》]{2,24})》', text)
if m:
return (m.group(1) or '').strip()
m = re.search(r'"([^"\n]{2,24})"', text) m = re.search(r'"([^"\n]{2,24})"', text)
if m: if m:
return (m.group(1) or '').strip() return (m.group(1) or '').strip()
@@ -451,57 +448,53 @@ def _fallback_title_from_exchange(user_text: str, assistant_text: str) -> Option
topic_name = _extract_named_topic(combined_raw) topic_name = _extract_named_topic(combined_raw)
if topic_name: if topic_name:
if any(k in combined for k in ('时间', 'time', '安排', '效率', '怎么办', '健身', '唱歌', '写毛笔', '不够用了')): if not _contains_latin(topic_name):
return f'{topic_name}与时间管理' if any(k in combined for k in ('time', 'schedule', 'efficiency', 'manage', 'fitness', 'singing', 'calligraphy')):
return 'Time management discussion'
if any(k in combined for k in ('hermes', 'codex', 'ai')):
return 'AI productivity discussion'
return 'Conversation topic'
if any(k in combined for k in ('time', 'schedule', 'efficiency', 'manage', 'fitness', 'singing', 'calligraphy')):
return f'{topic_name} time management'
if any(k in combined for k in ('hermes', 'codex', 'ai')): if any(k in combined for k in ('hermes', 'codex', 'ai')):
return f'{topic_name}AI效率' return f'{topic_name} AI productivity'
return f'{topic_name}讨论' return f'{topic_name} discussion'
if any(k in combined for k in ('title', '标题')) and any(k in combined for k in ('summary', 'summar', '摘要', '短标题')): if any(k in combined for k in ('title', 'session title')) and any(k in combined for k in ('summary', 'summar', 'short title')):
if any(k in combined for k in ('test', '测试', 'ok', '回复ok')): if any(k in combined for k in ('test', 'ok', 'reply ok')):
return '会话标题自动摘要测试' return 'Session title auto-summary test'
return '会话标题自动摘要' return 'Session title auto-summary'
if any(k in combined for k in ('clarify', '澄清')) and any(k in combined for k in ('dialog', 'card', '对话', '卡片')): if any(k in combined for k in ('clarify', 'clarification')) and any(k in combined for k in ('dialog', 'card')):
return 'Clarify 对话卡片' return 'Clarify dialog card'
if any(k in combined for k in ('issue', 'github', 'pr')) and any(k in combined for k in ('triage', 'bug', 'review', '问题')): if any(k in combined for k in ('issue', 'github', 'pr')) and any(k in combined for k in ('triage', 'bug', 'review')):
return 'GitHub Issue Triage' return 'GitHub Issue Triage'
head = re.split(r'[。!?.!?\n]', user_text)[0].strip() head = re.split(r'[.!?\n]', user_text)[0].strip()
if not head: if not head:
return None return None
stop_cjk = {
'我们', '看看', '一下', '这个', '标题', '是否', '可以', '用户', '理解', '这里', '测试', '一下',
'你只', '需要', '回复', '就可', '可以', '不需', '需要做', '什么', '自动', '成用户', '短标题',
}
stop_en = { stop_en = {
'the', 'this', 'that', 'with', 'from', 'into', 'just', 'reply', 'please', 'the', 'this', 'that', 'with', 'from', 'into', 'just', 'reply', 'please',
'need', 'needs', 'want', 'wants', 'user', 'assistant', 'could', 'would', 'need', 'needs', 'want', 'wants', 'user', 'assistant', 'could', 'would',
'should', 'about', 'there', 'here', 'test', 'testing', 'title', 'summary', 'should', 'about', 'there', 'here', 'test', 'testing', 'title', 'summary',
} }
tokens = re.findall(r'[\u4e00-\u9fff]{2,6}|[A-Za-z0-9][A-Za-z0-9_./+-]*', head) tokens = re.findall(r'[A-Za-z0-9][A-Za-z0-9_./+-]*', head)
if not tokens: if not tokens:
return head[:64] return 'Conversation topic'
picked = [] picked = []
for tok in tokens: for tok in tokens:
lower_tok = tok.lower() lower_tok = tok.lower()
if re.search(r'[\u4e00-\u9fff]', tok): if lower_tok in stop_en or len(lower_tok) < 3:
if tok in stop_cjk: continue
continue
else:
if lower_tok in stop_en or len(lower_tok) < 3:
continue
if tok not in picked: if tok not in picked:
picked.append(tok) picked.append(tok)
if len(picked) >= 4: if len(picked) >= 4:
break break
if picked: if picked:
if any(re.search(r'[\u4e00-\u9fff]', t) for t in picked):
return ''.join(picked)[:20]
return ' '.join(picked)[:60] return ' '.join(picked)[:60]
return head[:24] return 'Conversation topic'
def _run_background_title_update(session_id: str, user_text: str, assistant_text: str, placeholder_title: str, put_event, agent=None): def _run_background_title_update(session_id: str, user_text: str, assistant_text: str, placeholder_title: str, put_event, agent=None):

View File

@@ -137,16 +137,21 @@ class TestIssue495TitleStreaming(unittest.TestCase):
) )
def test_streaming_rejects_generic_completion_titles(self): def test_streaming_rejects_generic_completion_titles(self):
self.assertIn(
"测试完成",
STREAMING_PY,
"streaming.py should reject generic completion phrases as session titles",
)
self.assertIn( self.assertIn(
"all set", "all set",
STREAMING_PY, STREAMING_PY,
"streaming.py should reject generic English completion phrases as session titles", "streaming.py should reject generic English completion phrases as session titles",
) )
self.assertIn(
"completed",
STREAMING_PY,
"streaming.py should reject completion-status titles as session titles",
)
self.assertNotIn(
"测试完成",
STREAMING_PY,
"streaming.py title generation should stay English-only",
)
def test_streaming_uses_reasoning_split_for_minimax_titles(self): def test_streaming_uses_reasoning_split_for_minimax_titles(self):
self.assertIn( self.assertIn(

View File

@@ -1,6 +1,11 @@
import unittest import unittest
from pathlib import Path
from api.streaming import _first_exchange_snippets, _sanitize_generated_title from api.streaming import (
_fallback_title_from_exchange,
_first_exchange_snippets,
_sanitize_generated_title,
)
class TestGeneratedTitleSanitization(unittest.TestCase): class TestGeneratedTitleSanitization(unittest.TestCase):
@@ -33,3 +38,31 @@ class TestGeneratedTitleSanitization(unittest.TestCase):
_first_exchange_snippets(messages), _first_exchange_snippets(messages),
("What time is it in San Francisco?", "It is 6:16 PM in San Francisco."), ("What time is it in San Francisco?", "It is 6:16 PM in San Francisco."),
) )
def test_fallback_title_uses_english_discussion_suffix(self):
    """A quoted Latin-script topic is titled '<topic> discussion'."""
    user_text = 'Please review "random cancel"'
    title = _fallback_title_from_exchange(user_text, "")
    self.assertEqual(title, "random cancel discussion")
def test_fallback_title_summary_label_is_english(self):
    """A title/summary/test request yields the English summary-test label."""
    user_text = "Generate a short title summary test"
    title = _fallback_title_from_exchange(user_text, "")
    self.assertEqual(title, "Session title auto-summary test")
def test_fallback_title_non_latin_input_uses_english_placeholder(self):
    """User text without any Latin tokens gets the generic English placeholder."""
    user_text = "讨论一下这个问题"
    title = _fallback_title_from_exchange(user_text, "")
    self.assertEqual(title, "Conversation topic")
def test_fallback_title_non_latin_quoted_topic_uses_english_placeholder(self):
    """A quoted topic with no Latin letters is replaced by the English placeholder."""
    user_text = 'Please review "讨论主题"'
    title = _fallback_title_from_exchange(user_text, "")
    self.assertEqual(title, "Conversation topic")
def test_title_generation_source_has_no_cjk_literals(self):
    """Guard against CJK literals reappearing in the title-generation module.

    The module file is located through the already-imported
    ``_sanitize_generated_title`` instead of a path relative to the current
    working directory, so the result does not depend on where the test
    runner was started.
    """
    import inspect  # local import: only this test needs it

    src_file = inspect.getsourcefile(_sanitize_generated_title)
    self.assertIsNotNone(src_file, "could not locate the title generation source file")
    src = Path(src_file).read_text(encoding="utf-8")
    # U+4E00-U+9FFF is the CJK Unified Ideographs block.
    self.assertNotRegex(src, r"[\u4e00-\u9fff]", "title generation code should stay English-only")