|
|
//Copyright 2022 Ruel Tmeizeh - All Rights Reserved
|
|
|
package main
|
|
|
|
|
|
import (
|
|
|
"bufio"
|
|
|
"context"
|
|
|
"encoding/binary"
|
|
|
"flag"
|
|
|
"fmt"
|
|
|
"io"
|
|
|
"io/ioutil"
|
|
|
"log"
|
|
|
"os"
|
|
|
|
|
|
texttospeech "cloud.google.com/go/texttospeech/apiv1"
|
|
|
texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1"
|
|
|
)
|
|
|
|
|
|
// CommandlineOptions bundles the values selected on the command line.
// Every field is a pointer so it can hold the result of a flag definition
// directly; a nil field is omitted when the struct is encoded as JSON.
type CommandlineOptions struct {
	// ListVoices requests a listing of voices instead of speech synthesis.
	ListVoices *bool `json:"listvoices,omitempty"`
	// Ssml marks the input as SSML rather than plain text.
	Ssml *bool `json:"ssml,omitempty"`
	// Output is the destination path for the audio; "-" selects stdout.
	Output *string `json:"output,omitempty"`
	// Input is the path the text is read from; "-" selects stdin.
	Input *string `json:"input,omitempty"`
	// Language is the language code of the requested voice, e.g. "en-US".
	Language *string `json:"language,omitempty"`
	// Gender is the requested voice gender: "m", "f" or "n".
	Gender *string `json:"gender,omitempty"`
	// Voice is an explicit voice name.
	Voice *string `json:"voice,omitempty"`
	// Format is the name of the audio format, e.g. "mp3" or "pcm".
	Format *string `json:"format,omitempty"`
	// Speed is the speaking rate, where 1.0 is normal speed.
	Speed *float64 `json:"speed,omitempty"`
	// Pitch is the pitch adjustment, where 0.0 is normal.
	Pitch *float64 `json:"pitch,omitempty"`
	// SampleRate is the audio sample rate in Hz.
	SampleRate *int `json:"samplerate,omitempty"`
	// VolumeGain is the volume gain in dB.
	VolumeGain *float64 `json:"volume,omitempty"`
}
|
|
|
|
|
|
func main() {
|
|
|
//check commandline args:
|
|
|
opts := &CommandlineOptions{
|
|
|
ListVoices: flag.Bool("listvoices", false, "List available voices, rather than generate TTS. Use in\ncombination with '-l ALL' to show voices from all languages."),
|
|
|
Ssml: flag.Bool("ssml", false, "Input is SSML format, rather than plain text."),
|
|
|
Input: flag.String("i", "-", "Input file path. Defaults to stdin.\n"),
|
|
|
Output: flag.String("o", "./tts.mp3", "Output file path. Use '-' for stdout.\n"),
|
|
|
Language: flag.String("l", "en-US", "Language selection. 'en-US', 'en-GB', 'en-AU', 'en-IN',\n'el-GR', 'ru-RU', etc.\n"),
|
|
|
Gender: flag.String("g", "m", "Gender selection. [m,f,n] 'n' means neutral/don't care.\n"),
|
|
|
Format: flag.String("f", "mp3", "Audio format selection. PCM is uncompressed best quality. Opus is\nexcellent quality. MP3 is 32kb bitrate. [pcm,opus,mp3,ulaw,alaw]\n"),
|
|
|
Voice: flag.String("v", "unspecified", "Voice. If specified, this overrides language & gender.\n"),
|
|
|
Speed: flag.Float64("s", 1.0, "Speed. E.g. '1.0' is normal. '2.0' is double\nspeed, '0.25' is quarter speed, etc.\n"),
|
|
|
Pitch: flag.Float64("p", 0.0, "Pitch. E.g. '0.0' is normal. '20.0' is highest,\n'-20.0' is lowest.\n (default 0)"),
|
|
|
SampleRate: flag.Int("r", 24000, "Samplerate in Hz. [8000,11025,16000,22050,24000,32000,44100,48000]\n"),
|
|
|
VolumeGain: flag.Float64("-db", 0.0, "Volume gain in dB. [-96 to 16]\n (default 0)"),
|
|
|
}
|
|
|
flag.Parse()
|
|
|
|
|
|
var audioFormat texttospeechpb.AudioEncoding
|
|
|
var fileExtension string
|
|
|
switch *opts.Format {
|
|
|
case "mp3":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_MP3
|
|
|
fileExtension = "mp3"
|
|
|
case "opus":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_OGG_OPUS
|
|
|
fileExtension = "ogg"
|
|
|
case "ogg":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_OGG_OPUS
|
|
|
fileExtension = "ogg"
|
|
|
case "pcm":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_LINEAR16
|
|
|
fileExtension = "pcm"
|
|
|
case "ulaw":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_MULAW
|
|
|
fileExtension = "ulaw"
|
|
|
case "alaw":
|
|
|
audioFormat = texttospeechpb.AudioEncoding_ALAW
|
|
|
fileExtension = "alaw"
|
|
|
default:
|
|
|
audioFormat = texttospeechpb.AudioEncoding_MP3
|
|
|
fileExtension = "mp3"
|
|
|
}
|
|
|
|
|
|
filename := "tts." + fileExtension
|
|
|
if *opts.Output != "./tts.mp3" {
|
|
|
filename = *opts.Output
|
|
|
}
|
|
|
|
|
|
///////////////////////////////////////
|
|
|
//Instantiates a Google Cloud client
|
|
|
ctx := context.Background()
|
|
|
client, err := texttospeech.NewClient(ctx)
|
|
|
if err != nil {
|
|
|
log.Fatal(err)
|
|
|
}
|
|
|
defer client.Close()
|
|
|
|
|
|
if *opts.ListVoices {
|
|
|
fmt.Println("Available Voices:")
|
|
|
bufStdout := bufio.NewWriter(os.Stdout)
|
|
|
listVoices(bufStdout, ctx, client, *opts.Language)
|
|
|
bufStdout.Flush()
|
|
|
os.Exit(0)
|
|
|
}
|
|
|
|
|
|
var inputFile *os.File
|
|
|
if *opts.Input == "-" {
|
|
|
//read input from stdin
|
|
|
inputFile = os.Stdin
|
|
|
} else {
|
|
|
//read input from file
|
|
|
var err error
|
|
|
inputFile, err = os.Open(*opts.Input)
|
|
|
if err != nil {
|
|
|
log.Fatal(err)
|
|
|
}
|
|
|
defer inputFile.Close()
|
|
|
}
|
|
|
|
|
|
var input string
|
|
|
|
|
|
scanner := bufio.NewScanner(inputFile)
|
|
|
for scanner.Scan() {
|
|
|
//fmt.Println(scanner.Text())
|
|
|
input = input + scanner.Text()
|
|
|
}
|
|
|
|
|
|
//Start building TTS request things
|
|
|
synthInput := &texttospeechpb.SynthesisInput{}
|
|
|
synthInput.InputSource = &texttospeechpb.SynthesisInput_Text{Text: input}
|
|
|
if *opts.Ssml {
|
|
|
synthInput.InputSource = &texttospeechpb.SynthesisInput_Ssml{Ssml: input}
|
|
|
}
|
|
|
|
|
|
//Voice Gender
|
|
|
var gender texttospeechpb.SsmlVoiceGender
|
|
|
switch *opts.Gender {
|
|
|
case "m":
|
|
|
gender = texttospeechpb.SsmlVoiceGender_MALE
|
|
|
case "f":
|
|
|
gender = texttospeechpb.SsmlVoiceGender_FEMALE
|
|
|
default:
|
|
|
gender = texttospeechpb.SsmlVoiceGender_NEUTRAL
|
|
|
}
|
|
|
|
|
|
voice := &texttospeechpb.VoiceSelectionParams{
|
|
|
LanguageCode: *opts.Language,
|
|
|
SsmlGender: gender,
|
|
|
//Name: *opts.Voice, //Name overrides LanguageCode and SsmlGender
|
|
|
//Name: "en-US-Wavenet-B",
|
|
|
}
|
|
|
if *opts.Voice != "unspecified" {
|
|
|
voice.Name = *opts.Voice
|
|
|
}
|
|
|
|
|
|
//the request parameters
|
|
|
req := texttospeechpb.SynthesizeSpeechRequest{
|
|
|
Input: synthInput,
|
|
|
Voice: voice,
|
|
|
AudioConfig: &texttospeechpb.AudioConfig{
|
|
|
AudioEncoding: audioFormat,
|
|
|
SpeakingRate: *opts.Speed,
|
|
|
SampleRateHertz: int32(*opts.SampleRate),
|
|
|
Pitch: *opts.Pitch,
|
|
|
VolumeGainDb: *opts.VolumeGain,
|
|
|
},
|
|
|
}
|
|
|
|
|
|
resp, err := client.SynthesizeSpeech(ctx, &req)
|
|
|
if err != nil {
|
|
|
log.Fatal(err)
|
|
|
}
|
|
|
|
|
|
if *opts.Output == "-" { //write to stdout
|
|
|
//binary.Write(os.Stdout, binary.LittleEndian, resp.AudioContent)
|
|
|
bufStdout := bufio.NewWriter(os.Stdout) //add a buffer
|
|
|
defer bufStdout.Flush()
|
|
|
binary.Write(bufStdout, binary.LittleEndian, resp.AudioContent)
|
|
|
} else { //write to file
|
|
|
err = ioutil.WriteFile(filename, resp.AudioContent, 0644)
|
|
|
if err != nil {
|
|
|
log.Fatal(err)
|
|
|
}
|
|
|
fmt.Printf("Audio content written to file: %v\n", filename)
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
func listVoices(w io.Writer, ctx context.Context, client *texttospeech.Client, lang string) error {
|
|
|
resp, err := client.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{})
|
|
|
if err != nil {
|
|
|
return err
|
|
|
}
|
|
|
|
|
|
for _, voice := range resp.Voices {
|
|
|
for _, languageCode := range voice.LanguageCodes {
|
|
|
if lang == languageCode || lang == "ALL" {
|
|
|
fmt.Fprintln(w, "___________________________________")
|
|
|
fmt.Fprintf(w, "Name: %v\n", voice.Name)
|
|
|
fmt.Fprintf(w, " Language: %v\n", languageCode)
|
|
|
fmt.Fprintf(w, " Gender: %v\n", voice.SsmlGender.String())
|
|
|
fmt.Fprintf(w, " Native Sample Rate (in Hz): %v\n", voice.NaturalSampleRateHertz)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
fmt.Fprintln(w, "------------------------------------")
|
|
|
|
|
|
return nil
|
|
|
}
|