ChatTTS-Forge_English_interface

Sleeping

ChatTTS-Forge_English_interface / modules /api /impl /openai_api.py

zhzluke96

update

d2b7e94 over 1 year ago

5.37 kB

	from typing import List, Optional

	from fastapi import Body, File, Form, HTTPException, UploadFile
	from fastapi.responses import StreamingResponse
	from numpy import clip
	from pydantic import BaseModel, Field

	from modules.api import utils as api_utils
	from modules.api.Api import APIManager
	from modules.api.impl.handler.TTSHandler import TTSHandler
	from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
	from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
	from modules.api.impl.model.enhancer_model import EnhancerConfig
	from modules.data import styles_mgr
	from modules.speaker import Speaker, speaker_mgr


	class AudioSpeechRequest(BaseModel):
	    # Request body for the OpenAI-style `/v1/audio/speech` endpoint.
	    # Documented with `#` comments rather than a docstring: pydantic copies a
	    # class docstring into the generated schema description.

	    input: str  # text to synthesize
	    # Read by the handler but not used for anything; the route description
	    # says any value is accepted.
	    model: str = "chattts-4w"
	    # Speaker name; the handler rejects it with a 400 when
	    # `speaker_mgr.get_speaker` returns None.
	    voice: str = "female2"
	    # NOTE(review): the default is a plain string, not an AudioFormat member;
	    # the handler coerces string values with `AudioFormat(...)` before use.
	    response_format: AudioFormat = "mp3"
	    # Passed on as `AdjustConfig.speaking_rate`.
	    speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
	    # Forwarded to `InferConfig.seed`.
	    seed: int = 42

	    # Sampling parameters forwarded unchanged to `ChatTTSConfig`.
	    temperature: float = 0.3
	    top_k: int = 20
	    top_p: float = 0.7

	    # Optional style name; an empty string skips the style lookup.
	    style: str = ""
	    # Forwarded to `InferConfig.batch_size`.
	    batch_size: int = Field(1, ge=1, le=20, description="Batch size")
	    # Forwarded to `InferConfig.spliter_threshold`.
	    spliter_threshold: float = Field(
	        100, ge=10, le=1024, description="Threshold for sentence spliter"
	    )
	    # end of sentence
	    eos: str = "[uv_break]"

	    # Either flag enables the enhancer; `denoise` additionally selects
	    # lambd=0.9 instead of 0.1 in the handler.
	    enhance: bool = False
	    denoise: bool = False


	async def openai_speech_api(
	    request: AudioSpeechRequest = Body(
	        ..., description="JSON body with model, input text, and voice"
	    )
	):
	    """Synthesize speech for an OpenAI-style ``/v1/audio/speech`` request.

	    Validates the text, voice and style, builds the TTS/infer/adjust/enhancer
	    configs from the request, and streams the encoded audio back.

	    Args:
	        request: Parsed JSON body. ``request.model`` is accepted but not used
	            by this handler.

	    Returns:
	        StreamingResponse over the buffer produced by
	        ``TTSHandler.enqueue_to_buffer``, with media type ``audio/mpeg`` for
	        mp3 and ``audio/<format value>`` otherwise.

	    Raises:
	        HTTPException: 400 when the input text is empty, the voice cannot be
	            resolved to a ``Speaker``, or the style lookup fails; 500 (with
	            the original message as detail) for any other error raised while
	            building or running the TTS handler.
	    """
	    input_text = request.input
	    voice = request.voice
	    style = request.style

	    # The value may still be a plain string (the model's default is the
	    # literal "mp3"), so coerce it to the enum member before using `.value`.
	    response_format = request.response_format
	    if not isinstance(response_format, AudioFormat) and isinstance(
	        response_format, str
	    ):
	        response_format = AudioFormat(response_format)

	    # Defensive re-clamp to the same 0.1-10 bounds the request model declares.
	    speed = clip(request.speed, 0.1, 10)

	    if not input_text:
	        raise HTTPException(status_code=400, detail="Input text is required.")
	    if speaker_mgr.get_speaker(voice) is None:
	        raise HTTPException(status_code=400, detail="Invalid voice.")
	    if style:
	        try:
	            styles_mgr.find_item_by_name(style)
	        except Exception as e:
	            # Only ordinary errors mean "unknown style"; KeyboardInterrupt and
	            # SystemExit must propagate instead of becoming a 400.
	            raise HTTPException(status_code=400, detail="Invalid style.") from e

	    ctx_params = api_utils.calc_spk_style(spk=voice, style=style)

	    # The resolved speaker is read from the "spk" entry of the helper's result.
	    speaker = ctx_params.get("spk")
	    if not isinstance(speaker, Speaker):
	        raise HTTPException(status_code=400, detail="Invalid voice.")

	    tts_config = ChatTTSConfig(
	        style=style,
	        temperature=request.temperature,
	        top_k=request.top_k,
	        top_p=request.top_p,
	    )
	    infer_config = InferConfig(
	        batch_size=request.batch_size,
	        spliter_threshold=request.spliter_threshold,
	        eos=request.eos,
	        seed=request.seed,
	    )
	    adjust_config = AdjustConfig(speaking_rate=speed)
	    enhancer_config = EnhancerConfig(
	        # Either flag turns the enhancer on; denoise picks the larger lambd.
	        enabled=request.enhance or request.denoise,
	        lambd=0.9 if request.denoise else 0.1,
	    )
	    try:
	        handler = TTSHandler(
	            text_content=input_text,
	            spk=speaker,
	            tts_config=tts_config,
	            infer_config=infer_config,
	            adjust_config=adjust_config,
	            enhancer_config=enhancer_config,
	        )

	        buffer = handler.enqueue_to_buffer(response_format)

	        # mp3 uses the registered "audio/mpeg" type; other formats use their
	        # enum value as the subtype.
	        mime_type = f"audio/{response_format.value}"
	        if response_format == AudioFormat.mp3:
	            mime_type = "audio/mpeg"
	        return StreamingResponse(buffer, media_type=mime_type)

	    except Exception as e:
	        import logging

	        logging.exception(e)

	        # HTTP errors keep their own status code; anything else becomes a 500
	        # that stays chained to the original exception.
	        if isinstance(e, HTTPException):
	            raise
	        raise HTTPException(status_code=500, detail=str(e)) from e


	class TranscribeSegment(BaseModel):
	    # One entry of `TranscriptionsVerboseResponse.segments`.
	    # NOTE(review): field names presumably mirror the segment object of
	    # OpenAI's verbose transcription response — confirm against that spec.
	    # `#` comments are used instead of a docstring so the generated schema
	    # description is unchanged.
	    id: int
	    seek: float
	    start: float
	    end: float
	    text: str
	    tokens: list[int]
	    temperature: float
	    avg_logprob: float
	    compression_ratio: float
	    no_speech_prob: float


	class TranscriptionsVerboseResponse(BaseModel):
	    # Declared as the `response_model` of the `/v1/audio/transcriptions`
	    # route registered in `setup`.
	    # `#` comments are used instead of a docstring so the generated schema
	    # description is unchanged.
	    task: str
	    language: str
	    duration: float
	    text: str
	    segments: list[TranscribeSegment]


	def setup(app: APIManager):
	    """Register the OpenAI-compatible audio routes on ``app``.

	    Adds ``POST /v1/audio/speech`` (handled by ``openai_speech_api``) and
	    ``POST /v1/audio/transcriptions`` (a stub that is not implemented yet).

	    Args:
	        app: API manager exposing a FastAPI-style ``post`` route decorator.
	    """
	    app.post(
	        "/v1/audio/speech",
	        description="""
	openai api document:
	[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)

	以下属性为本系统自定义属性，不在openai文档中：
	- batch_size: 是否开启batch合成，小于等于1表示不使用batch （不推荐）
	- spliter_threshold: 开启batch合成时，句子分割的阈值
	- style: 风格

	> model 可填任意值
	""",
	    )(openai_speech_api)

	    @app.post(
	        "/v1/audio/transcriptions",
	        response_model=TranscriptionsVerboseResponse,
	        description="Transcribes audio into the input language.",
	    )
	    async def transcribe(
	        file: UploadFile = File(...),
	        model: str = Form(...),
	        language: Optional[str] = Form(None),
	        prompt: Optional[str] = Form(None),
	        response_format: str = Form("json"),
	        temperature: float = Form(0),
	        timestamp_granularities: List[str] = Form(["segment"]),
	    ):
	        # TODO: Implement transcribe
	        # Report the stub explicitly: a generic success payload would not
	        # satisfy the declared response_model (presumably failing response
	        # validation), so answer 501 Not Implemented instead.
	        raise HTTPException(status_code=501, detail="not implemented yet")