336 lines
10 KiB
JavaScript
336 lines
10 KiB
JavaScript
|
|
/**
 * 火山引擎 TTS API 封装 (Volcengine TTS API wrapper)
 * V3 单向流式 + V1 试用 (V3 unidirectional streaming + V1 trial mode)
 */
|
|||
|
|
|
|||
|
|
// Endpoint targeted by the V3 synthesis functions in this module.
const V3_URL = 'https://openspeech.bytedance.com/api/v3/tts/unidirectional';
// Endpoint targeted by the trial-mode synthesis function in this module.
const FREE_V1_URL = 'https://hstts.velure.top';

/**
 * Voices selectable in trial mode.
 * Each row is [key, display name, style tag, gender]; rows are expanded into
 * `{ key, name, tag, gender }` objects in the order listed.
 */
export const FREE_VOICES = [
  ['female_1', '桃夭', '甜蜜仙子', 'female'],
  ['female_2', '霜华', '清冷仙子', 'female'],
  ['female_3', '顾姐', '御姐烟嗓', 'female'],
  ['female_4', '苏菲', '优雅知性', 'female'],
  ['female_5', '嘉欣', '港风甜心', 'female'],
  ['female_6', '青梅', '清秀少年音', 'female'],
  ['female_7', '可莉', '奶音萝莉', 'female'],
  ['male_1', '夜枭', '磁性低音', 'male'],
  ['male_2', '君泽', '温润公子', 'male'],
  ['male_3', '沐阳', '沉稳暖男', 'male'],
  ['male_4', '梓辛', '青春少年', 'male'],
].map(([key, name, tag, gender]) => ({ key, name, tag, gender }));

/** Voice key used by trial mode when the caller does not pick one. */
export const FREE_DEFAULT_VOICE = 'female_1';
|
|||
|
|
|
|||
|
|
// ============ 内部工具 ============
|
|||
|
|
|
|||
|
|
/**
 * Issues a fetch through the relative `/proxy/` route instead of calling the
 * target directly.
 *
 * @param {string} url - Target URL; it is URI-component-encoded and appended
 *   to the `/proxy/` path.
 * @param {object} [options={}] - Passed to `fetch` unchanged.
 * @returns {Promise<Response>} Whatever `fetch` resolves with.
 */
async function proxyFetch(url, options = {}) {
  const encodedTarget = encodeURIComponent(url);
  return fetch(`/proxy/${encodedTarget}`, options);
}
|
|||
|
|
|
|||
|
|
/**
 * Returns at most the last four characters of a value's string form, so that
 * identifiers can be logged without printing them in full.
 *
 * @param {*} value - Value to abbreviate.
 * @returns {string} The trailing characters, or '' for any falsy value.
 */
function safeTail(value) {
  if (!value) {
    return '';
  }
  return String(value).slice(-4);
}
|
|||
|
|
|
|||
|
|
// ============ V3 鉴权模式 ============
|
|||
|
|
|
|||
|
|
/**
 * Rejects a V3 request early when a mandatory field is missing.
 *
 * @param {object} params - Caller-supplied synthesis parameters.
 * @throws {Error} When appId, accessKey, text or speaker is missing or falsy.
 */
function assertV3RequiredParams({ appId, accessKey, text, speaker }) {
  if (!appId || !accessKey || !text || !speaker) {
    throw new Error('缺少必要参数: appId/accessKey/text/speaker');
  }
}

/**
 * Builds the JSON body shared by both V3 entry points.
 *
 * @param {object} params - Same parameter object the entry points receive.
 * @returns {object} `{ user, req_params }` ready to be JSON-serialised.
 */
function buildV3RequestBody(params) {
  const {
    uid = 'st_user',
    text,
    speaker,
    model,
    format = 'mp3',
    sampleRate = 24000,
    speechRate = 0,
    loudnessRate = 0,
    emotion,
    emotionScale,
    contextTexts,
    explicitLanguage,
    disableMarkdownFilter = true,
    disableEmojiFilter,
    enableLanguageDetector,
    maxLengthToFilterParenthesis,
    postProcessPitch,
    cacheConfig,
  } = params;

  // Optional switches; only the ones that are actually set are sent.
  const additions = {};
  if (contextTexts?.length) additions.context_texts = contextTexts;
  if (explicitLanguage) additions.explicit_language = explicitLanguage;
  if (disableMarkdownFilter) additions.disable_markdown_filter = true;
  if (disableEmojiFilter) additions.disable_emoji_filter = true;
  if (enableLanguageDetector) additions.enable_language_detector = true;
  if (Number.isFinite(maxLengthToFilterParenthesis)) {
    additions.max_length_to_filter_parenthesis = maxLengthToFilterParenthesis;
  }
  if (Number.isFinite(postProcessPitch) && postProcessPitch !== 0) {
    additions.post_process = { pitch: postProcessPitch };
  }
  if (cacheConfig && typeof cacheConfig === 'object') {
    additions.cache_config = cacheConfig;
  }

  const audioParams = {
    format,
    sample_rate: sampleRate,
    speech_rate: speechRate,
    loudness_rate: loudnessRate,
  };
  if (emotion) {
    audioParams.emotion = emotion;
    // A missing (or zero) scale falls back to 4.
    audioParams.emotion_scale = emotionScale || 4;
  }

  const reqParams = { text, speaker, audio_params: audioParams };
  if (model) reqParams.model = model;
  if (Object.keys(additions).length > 0) {
    // `additions` is sent as a JSON string nested inside the JSON body.
    reqParams.additions = JSON.stringify(additions);
  }

  return { user: { uid }, req_params: reqParams };
}

/**
 * POSTs a V3 request through the proxy and checks the HTTP status.
 *
 * @param {object} params - Synthesis parameters (already validated).
 * @param {object} authHeaders - Sent verbatim as the request headers.
 * @param {AbortSignal} [signal] - Optional abort signal handed to fetch.
 * @returns {Promise<{resp: Response, logid: string}>} The open response and
 *   the value of its `X-Tt-Logid` header ('' when absent).
 * @throws {Error} When the response status is not OK.
 */
async function openV3Response(params, authHeaders, signal) {
  const resp = await proxyFetch(V3_URL, {
    method: 'POST',
    headers: authHeaders,
    body: JSON.stringify(buildV3RequestBody(params)),
    signal,
  });

  const logid = resp.headers.get('X-Tt-Logid') || '';
  if (!resp.ok) {
    const errText = await resp.text().catch(() => '');
    throw new Error(`V3 请求失败: ${resp.status} ${errText}${logid ? ` (logid: ${logid})` : ''}`);
  }
  return { resp, logid };
}

/**
 * Reads a newline-delimited JSON response body, handing every decoded audio
 * payload to `onAudio` and remembering the usage record.
 *
 * @param {Response} resp - Response whose body is read to the end.
 * @param {(bytes: Uint8Array) => void} onAudio - Receives each audio chunk in
 *   arrival order. Exceptions it throws abort the read and propagate.
 * @returns {Promise<object|null>} The `usage` object of an event whose code is
 *   20000000, or null if none was seen.
 * @throws {Error} When the response exposes no readable body.
 */
async function readV3Events(resp, onAudio) {
  const reader = resp.body?.getReader();
  if (!reader) throw new Error('V3 响应流不可用');

  const decoder = new TextDecoder();
  let usage = null;
  let pending = '';

  const handleLine = (line) => {
    if (!line.trim()) return;

    let event;
    let audio = null;
    try {
      event = JSON.parse(line);
      if (event?.data) {
        audio = Uint8Array.from(atob(event.data), (ch) => ch.charCodeAt(0));
      }
    } catch {
      // Best effort: a line that is not valid JSON / base64 is skipped.
      return;
    }

    // Deliberately outside the try: consumer failures must not be hidden.
    if (audio) onAudio(audio);
    if (event?.code === 20000000 && event.usage) {
      usage = event.usage;
    }
  };

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;

      pending += decoder.decode(value, { stream: true });
      const lines = pending.split('\n');
      // The last piece may be an incomplete line; keep it for the next read.
      pending = lines.pop() ?? '';
      for (const line of lines) handleLine(line);
    }

    // Flush the decoder and parse a final line that had no trailing newline.
    pending += decoder.decode();
    handleLine(pending);
  } catch (err) {
    // Stop the transfer instead of leaving a half-read body behind.
    await reader.cancel().catch(() => {});
    throw err;
  }

  return usage;
}

/**
 * V3 unidirectional streaming synthesis, collected into a single Blob.
 *
 * @param {object} params - Synthesis parameters. `appId`, `accessKey`, `text`
 *   and `speaker` are required; the remaining fields are optional and shape
 *   the request body.
 * @param {object} [authHeaders={}] - Sent verbatim as the request headers.
 * @param {object} [options={}] - Optional settings.
 * @param {AbortSignal} [options.signal] - Aborts the request when signalled.
 * @returns {Promise<{audioBlob: Blob, usage: (object|null), logid: string}>}
 * @throws {Error} On missing parameters, a non-OK HTTP status, an unreadable
 *   response body, or when the stream contained no audio.
 */
export async function synthesizeV3(params, authHeaders = {}, options = {}) {
  assertV3RequiredParams(params);

  const {
    appId,
    accessKey,
    resourceId = 'seed-tts-2.0',
    speaker,
    text,
    contextTexts,
    emotion,
  } = params;

  // Only the tail of each credential is logged.
  console.log('[TTS API] V3 request:', {
    appIdTail: safeTail(appId),
    accessKeyTail: safeTail(accessKey),
    resourceId,
    speaker,
    textLength: text.length,
    hasContextTexts: !!contextTexts?.length,
    hasEmotion: !!emotion,
  });

  const { resp, logid } = await openV3Response(params, authHeaders, options.signal);

  const audioChunks = [];
  const usage = await readV3Events(resp, (bytes) => {
    audioChunks.push(bytes);
  });

  if (audioChunks.length === 0) {
    throw new Error(`未收到音频数据${logid ? ` (logid: ${logid})` : ''}`);
  }

  return {
    // NOTE(review): the MIME type is fixed even when `format` is not 'mp3' — confirm with callers.
    audioBlob: new Blob(audioChunks, { type: 'audio/mpeg' }),
    usage,
    logid,
  };
}

/**
 * V3 unidirectional streaming synthesis that reports audio as it arrives.
 *
 * @param {object} params - Synthesis parameters. `appId`, `accessKey`, `text`
 *   and `speaker` are required; the remaining fields are optional and shape
 *   the request body.
 * @param {object} [authHeaders={}] - Sent verbatim as the request headers.
 * @param {object} [options={}] - Optional settings.
 * @param {AbortSignal} [options.signal] - Aborts the request when signalled.
 * @param {(bytes: Uint8Array) => void} [options.onChunk] - Called with each
 *   decoded audio chunk. An exception thrown here rejects the returned promise.
 * @returns {Promise<{usage: (object|null), logid: string}>}
 * @throws {Error} On missing parameters, a non-OK HTTP status, or an
 *   unreadable response body.
 */
export async function synthesizeV3Stream(params, authHeaders = {}, options = {}) {
  assertV3RequiredParams(params);

  const { resp, logid } = await openV3Response(params, authHeaders, options.signal);
  const usage = await readV3Events(resp, (bytes) => {
    options.onChunk?.(bytes);
  });

  return { usage, logid };
}
|
|||
|
|
|
|||
|
|
// ============ 试用模式 ============
|
|||
|
|
|
|||
|
|
/**
 * Trial-mode synthesis: posts the text to the trial endpoint and returns the
 * audio as the base64 string found in the reply's `data` field.
 *
 * @param {object} params - Synthesis parameters.
 * @param {string} params.text - Text to synthesize (required).
 * @param {string} [params.voiceKey=FREE_DEFAULT_VOICE] - Voice key to request.
 * @param {number} [params.speed=1.0] - Speed; values that do not convert to a
 *   non-zero number fall back to 1.0.
 * @param {?string} [params.emotion=null] - When set, sent together with a
 *   fixed `emotionScale` of 5.
 * @param {object} [options={}] - Optional settings.
 * @param {AbortSignal} [options.signal] - Aborts the request when signalled.
 * @returns {Promise<{audioBase64: string}>}
 * @throws {Error} When `text` is missing, the HTTP status is not OK, the reply
 *   code is not 3000, or the reply carries no audio payload.
 */
export async function synthesizeFreeV1(params, options = {}) {
  const {
    voiceKey = FREE_DEFAULT_VOICE,
    text,
    speed = 1.0,
    emotion = null,
  } = params || {};

  if (!text) {
    throw new Error('缺少必要参数: text');
  }

  // Going through globalThis keeps the fallback reachable when no `crypto`
  // global exists at all (a bare `crypto` reference would throw instead).
  const reqid =
    globalThis.crypto?.randomUUID?.() ||
    `${Date.now()}_${Math.random().toString(36).slice(2)}`;

  const requestBody = {
    voiceKey,
    text: String(text),
    speed: Number(speed) || 1.0,
    uid: 'xb_' + Date.now(),
    reqid,
  };

  if (emotion) {
    requestBody.emotion = emotion;
    requestBody.emotionScale = 5;
  }

  const res = await fetch(FREE_V1_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(requestBody),
    signal: options.signal,
  });

  if (!res.ok) throw new Error(`TTS HTTP ${res.status}`);

  const data = await res.json();
  // The `!data` check covers a literal `null` JSON reply, which would
  // otherwise fail with an unrelated TypeError.
  if (!data || data.code !== 3000) {
    throw new Error(data?.message || 'TTS 合成失败');
  }
  // A success code without audio is still a failure for the caller.
  if (!data.data) {
    throw new Error('TTS 合成失败');
  }

  return { audioBase64: data.data };
}
|