You've heard of Google Chat, Google Voice, Google Talk? Well, this is Google Squawk! :-D It's a command-line application that connects to the Google Cloud TTS API and generates audio from text.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

203 lines
6.3 KiB

//Copyright 2022 Ruel Tmeizeh - All Rights Reserved
package main
import (
"bufio"
"context"
"encoding/binary"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
texttospeech "cloud.google.com/go/texttospeech/apiv1"
texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1"
)
// CommandlineOptions collects every command-line setting of the tool.
//
// Each field is a pointer because main fills the struct directly with the
// pointers returned by the flag package's definition functions; the values
// are only meaningful after flag.Parse has run.
type CommandlineOptions struct {
	// ListVoices switches the tool from synthesizing speech to listing voices.
	ListVoices *bool `json:"listvoices,omitempty"`
	// Ssml marks the input as SSML rather than plain text.
	Ssml *bool `json:"ssml,omitempty"`
	// Output is the destination path of the audio; "-" selects stdout.
	Output *string `json:"output,omitempty"`
	// Input is the path of the text to read; "-" selects stdin.
	Input *string `json:"input,omitempty"`
	// Language is the language code passed to the API, e.g. "en-US".
	Language *string `json:"language,omitempty"`
	// Gender is the requested voice gender: "m", "f" or "n".
	Gender *string `json:"gender,omitempty"`
	// Voice is an explicit voice name; "unspecified" means none was given.
	Voice *string `json:"voice,omitempty"`
	// Format is the audio format keyword (mp3, opus, pcm, ulaw, alaw).
	Format *string `json:"format,omitempty"`
	// Speed is the speaking rate sent with the request.
	Speed *float64 `json:"speed,omitempty"`
	// Pitch is the pitch adjustment sent with the request.
	Pitch *float64 `json:"pitch,omitempty"`
	// SampleRate is the requested sample rate in Hz.
	SampleRate *int `json:"samplerate,omitempty"`
	// VolumeGain is the volume gain in dB sent with the request.
	VolumeGain *float64 `json:"volume,omitempty"`
}
// main is the entry point of the command-line tool. It parses the flags,
// creates a Google Cloud Text-to-Speech client and then either lists the
// available voices (-listvoices) or synthesizes the input text/SSML and
// writes the resulting audio to a file or to stdout.
//
// Every error is fatal: it is logged and the process exits non-zero.
func main() {
	// The default output path doubles as a sentinel: when the user leaves -o
	// untouched the file name is derived from the selected audio format.
	const defaultOutput = "./tts.mp3"

	//check commandline args:
	opts := &CommandlineOptions{
		ListVoices: flag.Bool("listvoices", false, "List available voices, rather than generate TTS. Use in\ncombination with '-l ALL' to show voices from all languages."),
		Ssml:       flag.Bool("ssml", false, "Input is SSML format, rather than plain text."),
		Input:      flag.String("i", "-", "Input file path. Defaults to stdin.\n"),
		Output:     flag.String("o", defaultOutput, "Output file path. Use '-' for stdout.\n"),
		Language:   flag.String("l", "en-US", "Language selection. 'en-US', 'en-GB', 'en-AU', 'en-IN',\n'el-GR', 'ru-RU', etc.\n"),
		Gender:     flag.String("g", "m", "Gender selection. [m,f,n] 'n' means neutral/don't care.\n"),
		Format:     flag.String("f", "mp3", "Audio format selection. PCM is uncompressed best quality. Opus is\nexcellent quality. MP3 is 32kb bitrate. [pcm,opus,mp3,ulaw,alaw]\n"),
		Voice:      flag.String("v", "unspecified", "Voice. If specified, this overrides language & gender.\n"),
		Speed:      flag.Float64("s", 1.0, "Speed. E.g. '1.0' is normal. '2.0' is double\nspeed, '0.25' is quarter speed, etc.\n"),
		Pitch:      flag.Float64("p", 0.0, "Pitch. E.g. '0.0' is normal. '20.0' is highest,\n'-20.0' is lowest.\n (default 0)"),
		SampleRate: flag.Int("r", 24000, "Samplerate in Hz. [8000,11025,16000,22050,24000,32000,44100,48000]\n"),
		// The flag package strips the leading dashes itself, so the name is
		// registered without one. A name that starts with '-' can never be
		// matched on the command line, and newer Go releases panic when such
		// a flag is defined.
		VolumeGain: flag.Float64("db", 0.0, "Volume gain in dB. [-96 to 16]\n (default 0)"),
	}
	flag.Parse()

	// Map the format keyword to the API encoding and a file extension.
	// Unknown keywords deliberately fall back to MP3.
	var audioFormat texttospeechpb.AudioEncoding
	var fileExtension string
	switch *opts.Format {
	case "opus", "ogg":
		audioFormat = texttospeechpb.AudioEncoding_OGG_OPUS
		fileExtension = "ogg"
	case "pcm":
		audioFormat = texttospeechpb.AudioEncoding_LINEAR16
		fileExtension = "pcm"
	case "ulaw":
		audioFormat = texttospeechpb.AudioEncoding_MULAW
		fileExtension = "ulaw"
	case "alaw":
		audioFormat = texttospeechpb.AudioEncoding_ALAW
		fileExtension = "alaw"
	default: // "mp3" and anything unrecognized
		audioFormat = texttospeechpb.AudioEncoding_MP3
		fileExtension = "mp3"
	}

	filename := "tts." + fileExtension
	if *opts.Output != defaultOutput {
		filename = *opts.Output
	}

	///////////////////////////////////////
	//Instantiates a Google Cloud client
	ctx := context.Background()
	client, err := texttospeech.NewClient(ctx)
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	if *opts.ListVoices {
		fmt.Println("Available Voices:")
		bufStdout := bufio.NewWriter(os.Stdout)
		if err := listVoices(bufStdout, ctx, client, *opts.Language); err != nil {
			log.Fatal(err)
		}
		if err := bufStdout.Flush(); err != nil {
			log.Fatal(err)
		}
		// Return instead of os.Exit so the deferred client.Close runs.
		return
	}

	// Select the input source: stdin for "-", otherwise the named file.
	inputFile := os.Stdin
	if *opts.Input != "-" {
		f, err := os.Open(*opts.Input)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		inputFile = f
	}

	// Read the whole input verbatim. Reading line by line and concatenating
	// the lines would drop the newlines and glue the last word of one line to
	// the first word of the next.
	raw, err := ioutil.ReadAll(inputFile)
	if err != nil {
		log.Fatal(err)
	}
	input := string(raw)

	//Start building TTS request things
	synthInput := &texttospeechpb.SynthesisInput{}
	if *opts.Ssml {
		synthInput.InputSource = &texttospeechpb.SynthesisInput_Ssml{Ssml: input}
	} else {
		synthInput.InputSource = &texttospeechpb.SynthesisInput_Text{Text: input}
	}

	//Voice Gender
	var gender texttospeechpb.SsmlVoiceGender
	switch *opts.Gender {
	case "m":
		gender = texttospeechpb.SsmlVoiceGender_MALE
	case "f":
		gender = texttospeechpb.SsmlVoiceGender_FEMALE
	default:
		gender = texttospeechpb.SsmlVoiceGender_NEUTRAL
	}

	voice := &texttospeechpb.VoiceSelectionParams{
		LanguageCode: *opts.Language,
		SsmlGender:   gender,
	}
	// Only send a voice name when the user asked for one explicitly.
	if *opts.Voice != "unspecified" {
		voice.Name = *opts.Voice
	}

	//the request parameters
	req := texttospeechpb.SynthesizeSpeechRequest{
		Input: synthInput,
		Voice: voice,
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:   audioFormat,
			SpeakingRate:    *opts.Speed,
			SampleRateHertz: int32(*opts.SampleRate),
			Pitch:           *opts.Pitch,
			VolumeGainDb:    *opts.VolumeGain,
		},
	}

	resp, err := client.SynthesizeSpeech(ctx, &req)
	if err != nil {
		log.Fatal(err)
	}

	if *opts.Output == "-" { //write to stdout
		bufStdout := bufio.NewWriter(os.Stdout) //add a buffer
		if err := binary.Write(bufStdout, binary.LittleEndian, resp.AudioContent); err != nil {
			log.Fatal(err)
		}
		// Flush explicitly so a failed write to stdout is reported.
		if err := bufStdout.Flush(); err != nil {
			log.Fatal(err)
		}
		return
	}

	//write to file
	if err := ioutil.WriteFile(filename, resp.AudioContent, 0644); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Audio content written to file: %v\n", filename)
}
// listVoices requests the voice list from the Text-to-Speech client and
// writes one entry to w for every (voice, language code) pair whose language
// code equals lang. Passing "ALL" as lang prints every pair. A closing
// separator line is written after the entries.
//
// It returns the error of the ListVoices call, if any. Errors from writing
// to w are not reported.
func listVoices(w io.Writer, ctx context.Context, client *texttospeech.Client, lang string) error {
	listing, err := client.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{})
	if err != nil {
		return err
	}

	for _, v := range listing.Voices {
		for _, code := range v.LanguageCodes {
			// Skip pairs the caller did not ask for.
			if lang != "ALL" && lang != code {
				continue
			}
			fmt.Fprintln(w, "___________________________________")
			fmt.Fprintf(w, "Name: %v\n", v.Name)
			fmt.Fprintf(w, " Language: %v\n", code)
			fmt.Fprintf(w, " Gender: %v\n", v.SsmlGender.String())
			fmt.Fprintf(w, " Native Sample Rate (in Hz): %v\n", v.NaturalSampleRateHertz)
		}
	}

	fmt.Fprintln(w, "------------------------------------")
	return nil
}