xinnan-tech · Packeting1 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025 · Nov 20, 2025
diff --git a/main/manager-api/src/main/resources/db/changelog/202510141200.sql b/main/manager-api/src/main/resources/db/changelog/202510141200.sql
@@ -0,0 +1,15 @@
+-- 为阿里百炼流式语音合成添加多语言音色配置字段
+UPDATE `ai_model_provider` SET fields = '[{"key":"api_key","type":"string","label":"API密钥"},{"key":"output_dir","type":"string","label":"输出目录"},{"key":"model","type":"string","label":"模型名称"},{"key":"format","label":"音频格式","type":"string"},{"key":"sample_rate","label":"采样率","type":"number"},{"key": "volume", "type": "number", "label": "音量"},{"key": "rate", "type": "number", "label": "语速"},{"key": "pitch", "type": "number", "label": "音调"},{"key":"voice","type":"string","label":"默认音色"},{"key": "voice_zh", "type": "string", "label": "中文音色"},{"key": "voice_yue", "type": "string", "label": "粤语音色"},{"key": "voice_en", "type": "string", "label": "英语音色"},{"key": "voice_ja", "type": "string", "label": "日语音色"},{"key": "voice_ko", "type": "string", "label": "韩语音色"}]' WHERE id = 'SYSTEM_TTS_AliBLStreamTTS';
+
+-- 更新配置说明
+UPDATE `ai_model_config` SET
+`doc_link` = 'https://bailian.console.aliyun.com/?apiKey=1#/api-key',
+`remark` = '阿里百炼流式TTS说明：
+1. 访问 https://bailian.console.aliyun.com/?apiKey=1#/api-key 创建项目并获取appkey
+2. 支持实时流式合成，具有较低的延迟
+3. 支持多种音色设置和音频参数调整
+4. 使用FunASR进行语音识别时，可以自动选择对应语言音色
+5. 支持CosyVoice-V3大模型音色，价格实惠(0.4元/万字符)
+6. 支持实时调节音量、语速、音调等参数
+7. 如果需要使用CosyVoice-V3模型和一些限制类型的音色，需要联系阿里百炼客服申请
+' WHERE `id` = 'TTS_AliBLStreamTTS';
diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml
@@ -838,6 +838,16 @@ TTS:
     access_key_secret: 你的阿里云账号access_key_secret
     # 截至2025年7月21日大模型音色只有北京节点采用，其他节点暂不支持
     host: nls-gateway-cn-beijing.aliyuncs.com
+
+    # 多语言音色配置 - 根据ASR识别的语言标签自动切换音色
+    # 多语言仅限搭配FunASR SenseVoiceSmall模型使用
+    voice_zh: longxiaochun   # 中文音色
+    voice_en: longchen       # 英文音色  
+    voice_yue: longyu        # 粤语音色
+    voice_ja: longchen       # 日语音色
+    voice_ko: longchen       # 韩语音色
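+    default_voice: longxiaochun  # 默认音色（当语言标签不匹配或无语言标签时使用）; NOTE(review): the provider's visible fallback is self.voice and the DB field key is `voice` - confirm `default_voice` is actually read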
+
     # 以下可不用设置，使用默认设置
     # format: pcm  # 音频格式：pcm、wav、mp3
     # sample_rate: 16000  # 采样率：8000、16000、24000

diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -129,6 +129,7 @@ def __init__(
         # 所以涉及到ASR的变量，需要在这里定义，属于connection的私有变量
         self.asr_audio = []
         self.asr_audio_queue = queue.Queue()
+        self.current_language_tag = None  # 存储当前ASR识别的语言标签
 
         # llm相关变量
         self.llm_finish_task = True

diff --git a/main/xiaozhi-server/core/handle/receiveAudioHandle.py b/main/xiaozhi-server/core/handle/receiveAudioHandle.py
@@ -1,3 +1,4 @@
+import re
 import time
 import json
 import asyncio
@@ -41,6 +42,32 @@ async def startToChat(conn, text):
     # 检查输入是否是JSON格式（包含说话人信息）
     speaker_name = None
     actual_text = text
+    language_tag = None
+
+    # 检查当前使用的ASR是否为FunASR（本地或服务版本）
+    is_funasr = False
+    if hasattr(conn, 'asr') and conn.asr:
+        asr_module = conn.asr.__class__.__module__
+        if 'fun_local' in asr_module or 'fun_server' in asr_module:
+            is_funasr = True
+            conn.logger.bind(tag=TAG).debug(f"检测到FunASR语音识别: {asr_module}")
+
+    # 只有在使用FunASR时才处理语言标签
+    if is_funasr:
+        # 检查是否包含语言标签（如<|zh|>、<|en|>等）
+        lang_pattern = r'<\|([a-z]{2,3})\|>'
+        lang_match = re.search(lang_pattern, text)
+        if lang_match:
+            language_tag = lang_match.group(1)
+            conn.current_language_tag = language_tag
+            conn.logger.bind(tag=TAG).info(f"检测到FunASR语言标签: {language_tag}")
+
+            # 移除语言标签，保留纯文本内容
+            actual_text = re.sub(lang_pattern, '', text).strip()
+            conn.logger.bind(tag=TAG).debug(f"移除语言标签后的文本: {actual_text}")
+        else:
+            # 没有检测到语言标签时，清空之前的标签
+            conn.current_language_tag = None
 
     try:
         # 尝试解析JSON格式的输入
@@ -63,6 +90,10 @@ async def startToChat(conn, text):
     else:
         conn.current_speaker = None
 
+    # 如果不是FunASR，清空语言标签，不影响其他ASR
+    if not is_funasr:
+        conn.current_language_tag = None
+
     if conn.need_bind:
         await check_bind_device(conn)
         return

diff --git a/main/xiaozhi-server/core/providers/asr/fun_local.py b/main/xiaozhi-server/core/providers/asr/fun_local.py
@@ -6,6 +6,7 @@
 from config.logger import setup_logging
 from typing import Optional, Tuple, List
 from core.providers.asr.base import ASRProviderBase
+from core.providers.asr.utils import custom_lang_filter
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
 import shutil
@@ -99,7 +100,11 @@ async def speech_to_text(
                     use_itn=True,
                     batch_size_s=60,
                 )
-                text = rich_transcription_postprocess(result[0]["text"])
+
+                # text = rich_transcription_postprocess(result[0]["text"])
+
+                # Handle language tags
+                text = custom_lang_filter(result[0]["text"])
                 logger.bind(tag=TAG).debug(
                     f"语音识别耗时: {time.time() - start_time:.3f}s | 结果: {text}"
                 )

diff --git a/main/xiaozhi-server/core/providers/asr/fun_server.py b/main/xiaozhi-server/core/providers/asr/fun_server.py
@@ -1,12 +1,12 @@
 from typing import Optional, Tuple, List
 from core.providers.asr.base import ASRProviderBase
+from core.providers.asr.utils import custom_lang_filter
 from core.providers.asr.dto.dto import InterfaceType
 import ssl
 import json
 import websockets
 from config.logger import setup_logging
 import asyncio
-import re
 
 TAG = __name__
 logger = setup_logging()
@@ -151,9 +151,13 @@ async def speech_to_text(
 
                 # Get the result from the receive task
                 result = receive_task.result()
-                match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result)
-                if match:
-                    result = match.group(4).strip()
+
+                # match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result)
+                # if match:
+                #     result = match.group(4).strip()
+
+                # Handle language tags
+                result = custom_lang_filter(result)
                 return (
                     result,
                     file_path,

diff --git a/main/xiaozhi-server/core/providers/asr/utils.py b/main/xiaozhi-server/core/providers/asr/utils.py
@@ -0,0 +1,45 @@
+import re
+from config.logger import setup_logging
+
+TAG = __name__
+logger = setup_logging()
+
+
+def custom_lang_filter(text):
+    """Strip FunASR rich-transcription tags, keeping one leading language tag.
+
+    Every ``<|...|>`` marker is removed from ``text``. When the input carried a
+    spoken-language tag (zh/en/yue/ja/ko) and some transcript remains, the
+    first such tag is put back in front so callers can read the language.
+    ``<|nospeech|>`` is not a language and is never re-attached; input with no
+    transcript left yields ``""`` so a bare tag is not mistaken for speech.
+
+    Args:
+        text: Raw ASR output, possibly containing several tags; may be empty.
+
+    Returns:
+        str: The cleaned transcript, optionally prefixed with a language tag.
+
+    Examples:
+        >>> custom_lang_filter("<|zh|><|emotion:happy|>你好")
+        '<|zh|>你好'
+        >>> custom_lang_filter("<|timestamp:1.5|>测试")
+        '测试'
+        >>> custom_lang_filter("<|nospeech|><|EMO_UNKNOWN|>")
+        ''
+    """
+    if not text:
+        return ""
+
+    # Collect language tags before they are stripped with everything else.
+    lang_tags = re.findall(r"<\|(zh|en|yue|ja|ko)\|>", text)
+    clean_text = re.sub(r"<\|.*?\|>", "", text).strip()
+    if not clean_text or not lang_tags:
+        return clean_text
+
+    if len(lang_tags) > 1:
+        logger.bind(tag=TAG).warning(
+            f"检测到多个语言标签: {lang_tags}，仅使用第一个: {lang_tags[0]}"
+        )
+    return f"<|{lang_tags[0]}|>{clean_text}"
+
diff --git a/main/xiaozhi-server/core/providers/tts/alibl_stream.py b/main/xiaozhi-server/core/providers/tts/alibl_stream.py
@@ -39,6 +39,13 @@ def __init__(self, config, delete_audio_file):
         if config.get("private_voice"):
             self.voice = config.get("private_voice")
 
+        # 多语言音色配置
+        self.voice_zh = config.get("voice_zh", self.voice)  # 中文音色
+        self.voice_yue = config.get("voice_yue", self.voice)  # 粤语音色
+        self.voice_en = config.get("voice_en", self.voice)  # 英语音色
+        self.voice_ja = config.get("voice_ja", self.voice)  # 日语音色
+        self.voice_ko = config.get("voice_ko", self.voice)  # 韩语音色
+
         # 音频参数配置
         self.format = config.get("format", "pcm")
         sample_rate = config.get("sample_rate", "24000")
@@ -65,6 +72,38 @@ def __init__(self, config, delete_audio_file):
             sample_rate=self.sample_rate, channels=1, frame_size_ms=60
         )
 
+    def get_voice_by_language(self, language_tag):
+        """Return the voice for an ASR language tag, defaulting to ``self.voice``.
+
+        Per-language voices apply only when the connection's ASR class lives in
+        a ``fun_local`` / ``fun_server`` module (FunASR). An unknown tag, or a
+        per-language voice that is unset or blank, falls back to ``self.voice``.
+
+        Args:
+            language_tag: Language code such as ``zh``; may be None or empty.
+        """
+        if not language_tag:
+            return self.voice
+
+        # conn / conn.asr may be missing when no connection is bound.
+        asr = getattr(getattr(self, 'conn', None), 'asr', None)
+        asr_module = asr.__class__.__module__ if asr else ''
+        if 'fun_local' not in asr_module and 'fun_server' not in asr_module:
+            logger.bind(tag=TAG).debug(f"非FunASR语音识别，使用默认音色: {self.voice}")
+            return self.voice
+        logger.bind(tag=TAG).debug(f"当前使用FunASR语音识别: {asr_module}")
+
+        language_tag = language_tag.lower()
+        voice_map = {
+            'zh': self.voice_zh, 'yue': self.voice_yue, 'en': self.voice_en,
+            'ja': self.voice_ja, 'ko': self.voice_ko,
+        }
+
+        # `or` also covers a voice configured as blank/None, not just unknown tags.
+        selected_voice = voice_map.get(language_tag) or self.voice
+        logger.bind(tag=TAG).info(f"FunASR语言标签 '{language_tag}' 选择音色: {selected_voice}")
+        return selected_voice
+
     async def _ensure_connection(self):
         """确保WebSocket连接可用，支持60秒内连接复用"""
         try:
@@ -228,6 +267,9 @@ async def start_session(self, session_id):
             # 启动监听任务
             self._monitor_task = asyncio.create_task(self._start_monitor_tts_response())
 
+            # 根据当前语言标签选择音色
+            current_voice = self.get_voice_by_language(getattr(self.conn, 'current_language_tag', None))
+
             # 发送run-task消息启动会话
             run_task_message = {
                 "header": {
@@ -242,7 +284,7 @@ async def start_session(self, session_id):
                     "model": self.model,
                     "parameters": {
                         "text_type": "PlainText",
-                        "voice": self.voice,
+                        "voice": current_voice,
                         "format": self.format,
                         "sample_rate": self.sample_rate,
                         "volume": self.volume,
@@ -412,6 +454,13 @@ async def _generate_audio():
                 )
 
                 try:
+                    # 选择音色：优先使用当前连接的语言标签，否则使用默认音色
+                    # 注意：to_tts可能在独立场景下调用（无active connection）
+                    if hasattr(self, 'conn') and self.conn and hasattr(self.conn, 'current_language_tag'):
+                        current_voice = self.get_voice_by_language(self.conn.current_language_tag)
+                    else:
+                        current_voice = self.voice
+
                     # 发送run-task消息启动会话
                     run_task_message = {
                         "header": {
@@ -426,7 +475,7 @@ async def _generate_audio():
                             "model": self.model,
                             "parameters": {
                                 "text_type": "PlainText",
-                                "voice": self.voice,
+                                "voice": current_voice,
                                 "format": self.format,
                                 "sample_rate": self.sample_rate,
                                 "volume": self.volume,