Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions main/manager-api/src/main/resources/db/changelog/202510141200.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Add per-language voice fields (voice_zh, voice_yue, voice_en, voice_ja, voice_ko)
-- to the Alibaba Bailian streaming TTS provider definition.
-- The statement replaces the entire `fields` JSON array of the
-- 'SYSTEM_TTS_AliBLStreamTTS' row, so every field is listed, not only the new ones.
UPDATE `ai_model_provider` SET fields = '[{"key":"api_key","type":"string","label":"API密钥"},{"key":"output_dir","type":"string","label":"输出目录"},{"key":"model","type":"string","label":"模型名称"},{"key":"format","label":"音频格式","type":"string"},{"key":"sample_rate","label":"采样率","type":"number"},{"key": "volume", "type": "number", "label": "音量"},{"key": "rate", "type": "number", "label": "语速"},{"key": "pitch", "type": "number", "label": "音调"},{"key":"voice","type":"string","label":"默认音色"},{"key": "voice_zh", "type": "string", "label": "中文音色"},{"key": "voice_yue", "type": "string", "label": "粤语音色"},{"key": "voice_en", "type": "string", "label": "英语音色"},{"key": "voice_ja", "type": "string", "label": "日语音色"},{"key": "voice_ko", "type": "string", "label": "韩语音色"}]' WHERE id = 'SYSTEM_TTS_AliBLStreamTTS';

-- Update the configuration notes: set the documentation link and the
-- multi-line remark text of the 'TTS_AliBLStreamTTS' model config row.
-- The remark is a single string literal that spans several lines.
UPDATE `ai_model_config` SET
`doc_link` = 'https://bailian.console.aliyun.com/?apiKey=1#/api-key',
`remark` = '阿里百炼流式TTS说明:
1. 访问 https://bailian.console.aliyun.com/?apiKey=1#/api-key 创建项目并获取appkey
2. 支持实时流式合成,具有较低的延迟
3. 支持多种音色设置和音频参数调整
4. 使用FunASR进行语音识别时,可以自动选择对应语言音色
5. 支持CosyVoice-V3大模型音色,价格实惠(0.4元/万字符)
6. 支持实时调节音量、语速、音调等参数
7. 如果需要使用CosyVoice-V3模型和一些限制类型的音色,需要联系阿里百炼客服申请
' WHERE `id` = 'TTS_AliBLStreamTTS';
10 changes: 10 additions & 0 deletions main/xiaozhi-server/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,16 @@ TTS:
access_key_secret: 你的阿里云账号access_key_secret
# 截至2025年7月21日大模型音色只有北京节点采用,其他节点暂不支持
host: nls-gateway-cn-beijing.aliyuncs.com

# 多语言音色配置 - 根据ASR识别的语言标签自动切换音色
# 多语言仅限搭配FunASR SenseVoiceSmall模型使用
voice_zh: longxiaochun # 中文音色
voice_en: longchen # 英文音色
voice_yue: longyu # 粤语音色
voice_ja: longchen # 日语音色
voice_ko: longchen # 韩语音色
default_voice: longxiaochun # 默认音色(当语言标签不匹配或无语言标签时使用)

# 以下可不用设置,使用默认设置
# format: pcm # 音频格式:pcm、wav、mp3
# sample_rate: 16000 # 采样率:8000、16000、24000
Expand Down
1 change: 1 addition & 0 deletions main/xiaozhi-server/core/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def __init__(
# 所以涉及到ASR的变量,需要在这里定义,属于connection的私有变量
self.asr_audio = []
self.asr_audio_queue = queue.Queue()
self.current_language_tag = None # 存储当前ASR识别的语言标签

# llm相关变量
self.llm_finish_task = True
Expand Down
31 changes: 31 additions & 0 deletions main/xiaozhi-server/core/handle/receiveAudioHandle.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import time
import json
import asyncio
Expand Down Expand Up @@ -41,6 +42,32 @@ async def startToChat(conn, text):
# 检查输入是否是JSON格式(包含说话人信息)
speaker_name = None
actual_text = text
language_tag = None

# 检查当前使用的ASR是否为FunASR(本地或服务版本)
is_funasr = False
if hasattr(conn, 'asr') and conn.asr:
asr_module = conn.asr.__class__.__module__
if 'fun_local' in asr_module or 'fun_server' in asr_module:
is_funasr = True
conn.logger.bind(tag=TAG).debug(f"检测到FunASR语音识别: {asr_module}")

# 只有在使用FunASR时才处理语言标签
if is_funasr:
# 检查是否包含语言标签(如<|zh|>、<|en|>等)
lang_pattern = r'<\|([a-z]{2,3})\|>'
lang_match = re.search(lang_pattern, text)
if lang_match:
language_tag = lang_match.group(1)
conn.current_language_tag = language_tag
conn.logger.bind(tag=TAG).info(f"检测到FunASR语言标签: {language_tag}")

# 移除语言标签,保留纯文本内容
actual_text = re.sub(lang_pattern, '', text).strip()
conn.logger.bind(tag=TAG).debug(f"移除语言标签后的文本: {actual_text}")
else:
# 没有检测到语言标签时,清空之前的标签
conn.current_language_tag = None

try:
# 尝试解析JSON格式的输入
Expand All @@ -63,6 +90,10 @@ async def startToChat(conn, text):
else:
conn.current_speaker = None

# 如果不是FunASR,清空语言标签,不影响其他ASR
if not is_funasr:
conn.current_language_tag = None

if conn.need_bind:
await check_bind_device(conn)
return
Expand Down
7 changes: 6 additions & 1 deletion main/xiaozhi-server/core/providers/asr/fun_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from config.logger import setup_logging
from typing import Optional, Tuple, List
from core.providers.asr.base import ASRProviderBase
from core.providers.asr.utils import custom_lang_filter
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
import shutil
Expand Down Expand Up @@ -99,7 +100,11 @@ async def speech_to_text(
use_itn=True,
batch_size_s=60,
)
text = rich_transcription_postprocess(result[0]["text"])

# text = rich_transcription_postprocess(result[0]["text"])

# Handle language tags
text = custom_lang_filter(result[0]["text"])
logger.bind(tag=TAG).debug(
f"语音识别耗时: {time.time() - start_time:.3f}s | 结果: {text}"
)
Expand Down
12 changes: 8 additions & 4 deletions main/xiaozhi-server/core/providers/asr/fun_server.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import Optional, Tuple, List
from core.providers.asr.base import ASRProviderBase
from core.providers.asr.utils import custom_lang_filter
from core.providers.asr.dto.dto import InterfaceType
import ssl
import json
import websockets
from config.logger import setup_logging
import asyncio
import re

TAG = __name__
logger = setup_logging()
Expand Down Expand Up @@ -151,9 +151,13 @@ async def speech_to_text(

# Get the result from the receive task
result = receive_task.result()
match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result)
if match:
result = match.group(4).strip()

# match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result)
# if match:
# result = match.group(4).strip()

# Handle language tags
result = custom_lang_filter(result)
return (
result,
file_path,
Expand Down
45 changes: 45 additions & 0 deletions main/xiaozhi-server/core/providers/asr/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import re
from config.logger import setup_logging

TAG = __name__
logger = setup_logging()


def custom_lang_filter(text):
    """
    Strip every ``<|...|>`` tag from a FunASR result except the language tag.

    The transcript is returned with all tags removed (timestamps, emotion,
    event markers, ...). When a spoken-language tag (``zh``, ``en``, ``yue``,
    ``ja``, ``ko``) is present, the first one is put back at the very start
    of the result so that later stages can still read it.

    ``<|nospeech|>`` is recognised but never re-emitted: it does not name a
    language, and a consumer that only understands the short language codes
    would otherwise pass the literal tag on as transcript text.

    Args:
        text: Raw ASR text, possibly carrying several ``<|...|>`` tags.
            ``None`` and the empty string are accepted.

    Returns:
        str: The tag-free, whitespace-trimmed transcript, prefixed with the
        first spoken-language tag if one was found. Empty input yields ``""``.

    Examples:
        >>> custom_lang_filter("<|zh|><|emotion:happy|>你好")
        '<|zh|>你好'
        >>> custom_lang_filter("<|en|>hello world")
        '<|en|>hello world'
        >>> custom_lang_filter("<|timestamp:1.5|>测试")
        '测试'
        >>> custom_lang_filter("<|nospeech|><|EMO_UNKNOWN|>")
        ''
    """
    if not text:
        return ""

    # "nospeech" is matched on purpose so it can be told apart from real
    # language codes and filtered out below.
    found_tags = re.findall(r"<\|(zh|en|yue|ja|ko|nospeech)\|>", text)
    spoken_tags = [tag for tag in found_tags if tag != "nospeech"]

    # Drop every <|...|> tag, then trim before the language tag is put back
    # so no whitespace is left between the tag and the transcript.
    clean_text = re.sub(r"<\|.*?\|>", "", text).strip()

    if not spoken_tags:
        return clean_text

    if len(spoken_tags) > 1:
        logger.bind(tag=TAG).warning(
            f"检测到多个语言标签: {spoken_tags},仅使用第一个: {spoken_tags[0]}"
        )

    return f"<|{spoken_tags[0]}|>{clean_text}"

53 changes: 51 additions & 2 deletions main/xiaozhi-server/core/providers/tts/alibl_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ def __init__(self, config, delete_audio_file):
if config.get("private_voice"):
self.voice = config.get("private_voice")

# 多语言音色配置
self.voice_zh = config.get("voice_zh", self.voice) # 中文音色
self.voice_yue = config.get("voice_yue", self.voice) # 粤语音色
self.voice_en = config.get("voice_en", self.voice) # 英语音色
self.voice_ja = config.get("voice_ja", self.voice) # 日语音色
self.voice_ko = config.get("voice_ko", self.voice) # 韩语音色

# 音频参数配置
self.format = config.get("format", "pcm")
sample_rate = config.get("sample_rate", "24000")
Expand All @@ -65,6 +72,38 @@ def __init__(self, config, delete_audio_file):
sample_rate=self.sample_rate, channels=1, frame_size_ms=60
)

def get_voice_by_language(self, language_tag):
    """Return the voice to use for a language tag (only effective with FunASR).

    The default voice ``self.voice`` is returned when no tag is given, when
    the connection's ASR provider is not FunASR, when the tag has no entry
    in the voice map, or when the matching per-language voice is configured
    as an empty value.

    Args:
        language_tag: Language code such as ``"zh"`` or ``"en"``; matched
            case-insensitively. May be ``None`` or empty.

    Returns:
        The voice for the language, or ``self.voice`` as the fallback.
    """
    if not language_tag:
        return self.voice

    # FunASR is identified by the module name of the ASR provider class
    # attached to the current connection (local or server variant).
    conn = getattr(self, "conn", None)
    asr = getattr(conn, "asr", None) if conn else None
    asr_module = asr.__class__.__module__ if asr else ""
    is_funasr = "fun_local" in asr_module or "fun_server" in asr_module

    if not is_funasr:
        # Other ASR providers always use the default voice.
        logger.bind(tag=TAG).debug(f"非FunASR语音识别,使用默认音色: {self.voice}")
        return self.voice

    logger.bind(tag=TAG).debug(f"当前使用FunASR语音识别: {asr_module}")

    language_tag = language_tag.lower()
    voice_map = {
        "zh": self.voice_zh,
        "yue": self.voice_yue,
        "en": self.voice_en,
        "ja": self.voice_ja,
        "ko": self.voice_ko,
    }

    # `or` rather than a dict default: a per-language voice that is present
    # in the config but left empty must not be used as the voice.
    selected_voice = voice_map.get(language_tag) or self.voice
    logger.bind(tag=TAG).info(f"FunASR语言标签 '{language_tag}' 选择音色: {selected_voice}")
    return selected_voice

async def _ensure_connection(self):
"""确保WebSocket连接可用,支持60秒内连接复用"""
try:
Expand Down Expand Up @@ -228,6 +267,9 @@ async def start_session(self, session_id):
# 启动监听任务
self._monitor_task = asyncio.create_task(self._start_monitor_tts_response())

# 根据当前语言标签选择音色
current_voice = self.get_voice_by_language(getattr(self.conn, 'current_language_tag', None))

# 发送run-task消息启动会话
run_task_message = {
"header": {
Expand All @@ -242,7 +284,7 @@ async def start_session(self, session_id):
"model": self.model,
"parameters": {
"text_type": "PlainText",
"voice": self.voice,
"voice": current_voice,
"format": self.format,
"sample_rate": self.sample_rate,
"volume": self.volume,
Expand Down Expand Up @@ -412,6 +454,13 @@ async def _generate_audio():
)

try:
# 选择音色:优先使用当前连接的语言标签,否则使用默认音色
# 注意:to_tts可能在独立场景下调用(无active connection)
if hasattr(self, 'conn') and self.conn and hasattr(self.conn, 'current_language_tag'):
current_voice = self.get_voice_by_language(self.conn.current_language_tag)
else:
current_voice = self.voice

# 发送run-task消息启动会话
run_task_message = {
"header": {
Expand All @@ -426,7 +475,7 @@ async def _generate_audio():
"model": self.model,
"parameters": {
"text_type": "PlainText",
"voice": self.voice,
"voice": current_voice,
"format": self.format,
"sample_rate": self.sample_rate,
"volume": self.volume,
Expand Down