Frequently Asked Questions (FAQ)
A REST API (Representational State Transfer Application Programming Interface) is an interface that allows external programs to access the functionality of an application. It enables communication via HTTP requests and responses, allowing you to automate tasks such as sending speech synthesis requests, checking their status, and retrieving results.
By enabling the REST API in Voisona Talk, you can control speech synthesis from various programming languages such as Python, C++, or JavaScript. This makes it possible to integrate Voisona Talk’s synthesized voices into your own applications, web services, chatbots, or games.
In the following tutorial, we’ll walk through how to enable the REST API in Voisona Talk and execute speech synthesis using Python.
*The REST API functionality is currently available as a beta version.
Below is the complete sample code. Each part of it is explained in the sections that follow.
import argparse
import json
import os
import sys
import time
import xml.etree.ElementTree as ET
import requests
# Command-line configuration for the tutorial script.
parser = argparse.ArgumentParser()
parser.add_argument("--user", type=str, required=True, help="User name")
parser.add_argument("--password", type=str, required=True, help="API password")
parser.add_argument("--port", type=int, default=32766, help="Port number")
parser.add_argument("--output-wav", type=str, default="test.wav", help="WAV filename")
args = parser.parse_args()

# HTTP Basic auth credentials and the API root shared by every request below.
auth = (args.user, args.password)
# Bug fix: the original had the host wrapped in a markdown autolink
# ("<http://localhost>"), which is not a valid URL.
base_url = f"http://localhost:{args.port}/api/talk/v1/"
def get_voice_libraries():
    """Fetch the voice libraries installed in the editor, print them, and return the list."""
    response = requests.get(f"{base_url}voices", auth=auth)
    response.raise_for_status()
    libraries = response.json()["items"]
    print("The list of available voice libraries is shown below.")
    print(json.dumps(libraries, indent=2, ensure_ascii=False))
    return libraries
def synthesize_text(voice_library):
    """Queue a playback speech-synthesis request and return its UUID."""
    request_body = {
        "text": "こんにちは",
        "language": voice_library["languages"][0],
        "voice_name": voice_library["voice_name"],
        "voice_version": voice_library["voice_version"],
        # Evict older queued requests when the server-side limit is reached.
        "force_enqueue": True,
    }
    response = requests.post(f"{base_url}speech-syntheses", auth=auth, json=request_body)
    response.raise_for_status()
    uuid = response.json()["uuid"]
    print("Request sent successfully.")
    return uuid
def check_status(uuid, synth=True, timeout=30):
    """Poll the request identified by *uuid* until its state is "succeeded".

    synth selects the endpoint: speech-syntheses (True) or text-analyses (False).
    Raises TimeoutError if the request does not succeed within *timeout* seconds.
    Returns the final requests.Response.
    """
    deadline = time.time() + timeout
    resource = "speech-syntheses" if synth else "text-analyses"
    while True:
        response = requests.get(f"{base_url}{resource}/{uuid}", auth=auth)
        response.raise_for_status()
        if response.json()["state"] == "succeeded":
            break
        if time.time() > deadline:
            raise TimeoutError("Processing took too long.")
        time.sleep(0.1)
    print("Request processing completed.")
    return response
def delete_request(uuid):
    """Delete the speech-synthesis request identified by *uuid*."""
    response = requests.delete(base_url + "speech-syntheses/" + uuid, auth=auth)
    response.raise_for_status()
    # Bug fix: the original printed "Request processing completed.", duplicating
    # check_status's message; the tutorial text documents this message instead.
    print("Request deleted successfully.")
def synthesize_text_and_save(voice_library):
    """Queue a synthesis request whose result is written to a WAV file.

    The output path comes from --output-wav (made absolute) instead of being
    played through the audio device. Returns the request UUID.
    """
    target_path = os.path.abspath(args.output_wav)
    request_body = {
        "text": "こんにちは",
        "language": voice_library["languages"][0],
        "voice_name": voice_library["voice_name"],
        "voice_version": voice_library["voice_version"],
        "can_overwrite_file": True,
        "destination": "file",
        "output_file_path": target_path,
    }
    response = requests.post(f"{base_url}speech-syntheses", auth=auth, json=request_body)
    response.raise_for_status()
    uuid = response.json()["uuid"]
    print("Request sent successfully.")
    return uuid
def synthesize_text_with_global_parameters(voice_library):
    """Queue a synthesis request that overrides the global voice parameters.

    All parameters are left at neutral values except speed, which is doubled.
    Returns the request UUID.
    """
    overrides = {
        "alp": 0.0,
        "huskiness": 0.0,
        "intonation": 1.0,
        "pitch": 0.0,
        "speed": 2.0,
        "style_weights": [],
        "volume": 0.0,
    }
    request_body = {
        "text": "こんにちは",
        "language": voice_library["languages"][0],
        "voice_name": voice_library["voice_name"],
        "voice_version": voice_library["voice_version"],
        "global_parameters": overrides,
    }
    response = requests.post(f"{base_url}speech-syntheses", auth=auth, json=request_body)
    response.raise_for_status()
    uuid = response.json()["uuid"]
    print("Request sent successfully.")
    return uuid
def analyze_text():
    """Send a text-analysis request and return the analyzed TSML string.

    Waits for the analysis to complete via check_status(synth=False).
    """
    payload = {
        "text": "こんにちは",
        "language": "ja_JP",
    }
    response = requests.post(base_url + "text-analyses", auth=auth, json=payload)
    # Bug fix: the original skipped raise_for_status() here (every other request
    # has it), so an HTTP error surfaced as a KeyError on "uuid" instead of an
    # HTTPError handled by the script's except clause.
    response.raise_for_status()
    uuid = response.json()["uuid"]
    response = check_status(uuid, synth=False)
    analyzed_text = response.json()["analyzed_text"]
    print("Text analysis completed.")
    print(analyzed_text)
    return analyzed_text
def synthesize_text_with_analyzed_text(voice_library, analyzed_text):
    """Edit the accent/pronunciation in analyzed TSML, then queue its synthesis.

    Returns the request UUID.
    """
    tree = ET.fromstring(analyzed_text)
    first_word = tree.find(".//word")
    # Rewrite the accent pattern and pronunciation of the first <word> element.
    first_word.set("hl", "hllll")
    first_word.set("pronunciation", "コンニチハ")
    edited_tsml = ET.tostring(tree, encoding="unicode")
    print(edited_tsml)
    request_body = {
        "analyzed_text": edited_tsml,
        "language": voice_library["languages"][0],
        "voice_name": voice_library["voice_name"],
        "voice_version": voice_library["voice_version"],
    }
    response = requests.post(f"{base_url}speech-syntheses", auth=auth, json=request_body)
    response.raise_for_status()
    uuid = response.json()["uuid"]
    print("Request sent successfully.")
    return uuid
try:
    voice_libraries = get_voice_libraries()
    if not voice_libraries:
        print("Please download a voice library.")
        sys.exit(1)
    voice_library = voice_libraries[0]

    # 1. Plain synthesis, then delete the finished request.
    uuid = synthesize_text(voice_library)
    check_status(uuid)
    delete_request(uuid)

    # 2. Synthesis saved to a WAV file.
    uuid = synthesize_text_and_save(voice_library)
    check_status(uuid)
    time.sleep(2)  # Wait for the previous audio playback to finish.

    # 3. Synthesis with global parameter overrides.
    uuid = synthesize_text_with_global_parameters(voice_library)
    check_status(uuid)
    time.sleep(2)  # Wait for the previous audio playback to finish.

    # 4. Text analysis followed by synthesis of the edited result.
    analyzed_text = analyze_text()
    uuid = synthesize_text_with_analyzed_text(voice_library, analyzed_text)
    check_status(uuid)

    print("The tutorial was completed without errors.")
except requests.exceptions.ConnectionError as e:
    print("Failed to connect. Please check the server status and configuration.")
    print(e)
except requests.exceptions.HTTPError as e:
    print("An HTTP error occurred.")
    print(e)
except Exception as e:
    print("An unexpected error occurred.")
    print(e)
To run the sample code, you need to install the requests package:
pip install requests
Example of running the sample:
python sample.py --user [email protected] --password 1234
Replace the username and password according to your API settings.
You can get the list of installed voice libraries as follows:
auth = (args.user, args.password)
# Bug fix: the original excerpt wrapped the host in a markdown autolink
# ("<http://localhost>"), which is not a valid URL.
base_url = f"http://localhost:{args.port}/api/talk/v1/"


def get_voice_libraries():
    """Fetch the voice libraries installed in the editor, print them, and return the list."""
    response = requests.get(base_url + "voices", auth=auth)
    response.raise_for_status()
    voice_libraries = response.json()["items"]
    print("The list of available voice libraries is shown below.")
    print(json.dumps(voice_libraries, indent=2, ensure_ascii=False))
    return voice_libraries
Example output:
[
{
"display_names": [
{
"language": "ja_JP",
"name": "田中傘"
},
{
"language": "en_US",
"name": "Tanaka San"
}
],
"languages": [
"ja_JP"
],
"voice_name": "tanaka-san_ja_JP",
"voice_version": "2.0.0"
}
]
If no voice library is downloaded in the editor, the result will be empty.
The following sends a speech synthesis request to the API server:
# Tutorial excerpt: queue a playback synthesis request and return its UUID.
def synthesize_text(voice_library):
payload = {
"text": "こんにちは",
"language": voice_library["languages"][0],
"voice_name": voice_library["voice_name"],
"voice_version": voice_library["voice_version"],
"force_enqueue": True,  # evict older requests when the server queue is full
}
response = requests.post(base_url + "speech-syntheses", auth=auth, json=payload)
response.raise_for_status()
uuid = response.json()["uuid"]
print("Request sent successfully.")
return uuid
In the sample code, the first detected voice library from the retrieved list is used as the voice library for synthesis.
When synthesis completes, you will hear “こんにちは” played through your default audio device.
Note that the server has a limit on the number of requests. If the limit is reached, requests may fail. Setting force_enqueue: true automatically removes older requests to make room for new ones.
You can check the status of the submitted request using the UUID obtained by executing the synthesize_text function.
# Tutorial excerpt (simplified, synthesis-only): poll the request until its
# state is "succeeded"; raise TimeoutError after *timeout* seconds.
def check_status(uuid, timeout=30):
start = time.time()
while True:
response = requests.get(base_url + "speech-syntheses/" + uuid, auth=auth)
response.raise_for_status()
state = response.json()["state"]
if state == "succeeded":
break
if time.time() - start > timeout:
raise TimeoutError("Processing took too long.")
time.sleep(0.1)
print("Request processing completed.")
return response
If state is queued, the request is waiting to be processed. If it’s succeeded, synthesis has completed successfully.
You can also delete a request by specifying its UUID:
# Tutorial excerpt: delete the request identified by *uuid*.
def delete_request(uuid):
response = requests.delete(base_url + "speech-syntheses/" + uuid, auth=auth)
response.raise_for_status()
print("Request deleted successfully.")
To save the synthesized result as a file instead of playing it, specify an absolute path:
# Tutorial excerpt: queue a synthesis request whose result is written to the
# WAV file given by --output-wav (as an absolute path) instead of being played.
def synthesize_text_and_save(voice_library):
payload = {
"text": "こんにちは",
"language": voice_library["languages"][0],
"voice_name": voice_library["voice_name"],
"voice_version": voice_library["voice_version"],
"can_overwrite_file": True,
"destination": "file",
"output_file_path": os.path.abspath(args.output_wav),
}
response = requests.post(base_url + "speech-syntheses", auth=auth, json=payload)
response.raise_for_status()
print("Request sent successfully.")
To modify voice expression, include a global_parameters object. For example, to double the speaking speed:
# Tutorial excerpt: queue a synthesis request that overrides the global voice
# parameters; all values are neutral except speed, which is doubled.
def synthesize_text_with_global_parameters(voice_library):
payload = {
"text": "こんにちは",
"language": voice_library["languages"][0],
"voice_name": voice_library["voice_name"],
"voice_version": voice_library["voice_version"],
"global_parameters": {
"alp": 0.0,
"huskiness": 0.0,
"intonation": 1.0,
"pitch": 0.0,
"speed": 2.0,
"style_weights": [],
"volume": 0.0,
},
}
response = requests.post(base_url + "speech-syntheses", auth=auth, json=payload)
response.raise_for_status()
print("Request sent successfully.")
You can perform fine-grained control by modifying analyzed text data. First, send a text analysis request:
def analyze_text():
    """Send a text-analysis request and return the analyzed TSML string."""
    payload = {
        "text": "こんにちは",
        "language": "ja_JP",
    }
    response = requests.post(base_url + "text-analyses", auth=auth, json=payload)
    # Bug fix: the original skipped raise_for_status() here, so an HTTP error
    # surfaced as a KeyError on "uuid" instead of an HTTPError.
    response.raise_for_status()
    uuid = response.json()["uuid"]
    response = check_status(uuid, synth=False)
    analyzed_text = response.json()["analyzed_text"]
    print("Text analysis completed.")
    print(analyzed_text)
    return analyzed_text
Example response:
<tsml><acoustic_phrase><word chain="0" hl="lhhhh" original="こんにちは" phoneme="k,o|N|n,i|ch,i|w,a" pos="感動詞" pronunciation="コンニチワ">こんにちは</word></acoustic_phrase></tsml>
By modifying the analyzed text and sending it back to the server, you can adjust the accent and pronunciation. The following is an example using an XML parser.
# Tutorial excerpt: edit the accent pattern ("hl") and pronunciation of the
# first <word> element in the analyzed TSML, then queue its synthesis.
def synthesize_text_with_analyzed_text(voice_library, analyzed_text):
root = ET.fromstring(analyzed_text)
word = root.find(".//word")
word.set("hl", "hllll")
word.set("pronunciation", "コンニチハ")
modified_analyzed_text = ET.tostring(root, encoding="unicode")
print(modified_analyzed_text)
payload = {
"analyzed_text": modified_analyzed_text,
"language": voice_library["languages"][0],
"voice_name": voice_library["voice_name"],
"voice_version": voice_library["voice_version"],
}
response = requests.post(base_url + "speech-syntheses", auth=auth, json=payload)
response.raise_for_status()
print("Request sent successfully.")
For more details, please refer to the “Talk API Reference” via the link in the API tab of the Preferences window in the Voisona Talk editor.