//Copyright 2022 Ruel Tmeizeh - All Rights Reserved package main import ( "bufio" "context" "encoding/binary" "flag" "fmt" "io" "io/ioutil" "log" "os" texttospeech "cloud.google.com/go/texttospeech/apiv1" texttospeechpb "google.golang.org/genproto/googleapis/cloud/texttospeech/v1" ) type CommandlineOptions struct { ListVoices *bool `json:"listvoices,omitempty"` Ssml *bool `json:"ssml,omitempty"` Output *string `json:"output,omitempty"` Input *string `json:"input,omitempty"` Language *string `json:"language,omitempty"` Gender *string `json:"gender,omitempty"` Voice *string `json:"voice,omitempty"` Format *string `json:"format,omitempty"` Speed *float64 `json:"speed,omitempty"` Pitch *float64 `json:"pitch,omitempty"` SampleRate *int `json:"samplerate,omitempty"` VolumeGain *float64 `json:"volume,omitempty"` } func main() { //check commandline args: opts := &CommandlineOptions{ ListVoices: flag.Bool("listvoices", false, "List available voices, rather than generate TTS. Use in\ncombination with '-l ALL' to show voices from all languages."), Ssml: flag.Bool("ssml", false, "Input is SSML format, rather than plain text."), Input: flag.String("i", "-", "Input file path. Defaults to stdin.\n"), Output: flag.String("o", "./tts.mp3", "Output file path. Use '-' for stdout.\n"), Language: flag.String("l", "en-US", "Language selection. 'en-US', 'en-GB', 'en-AU', 'en-IN',\n'el-GR', 'ru-RU', etc.\n"), Gender: flag.String("g", "m", "Gender selection. [m,f,n] 'n' means neutral/don't care.\n"), Format: flag.String("f", "mp3", "Audio format selection. PCM is uncompressed best quality. Opus is\nexcellent quality. MP3 is 32kb bitrate. [pcm,opus,mp3,ulaw,alaw]\n"), Voice: flag.String("v", "unspecified", "Voice. If specified, this overrides language & gender.\n"), Speed: flag.Float64("s", 1.0, "Speed. E.g. '1.0' is normal. '2.0' is double\nspeed, '0.25' is quarter speed, etc.\n"), Pitch: flag.Float64("p", 0.0, "Pitch. E.g. '0.0' is normal. '20.0' is highest,\n'-20.0' is lowest.\n (default 0)"), SampleRate: flag.Int("r", 24000, "Samplerate in Hz. [8000,11025,16000,22050,24000,32000,44100,48000]\n"), VolumeGain: flag.Float64("-db", 0.0, "Volume gain in dB. [-96 to 16]\n (default 0)"), } flag.Parse() var audioFormat texttospeechpb.AudioEncoding var fileExtension string switch *opts.Format { case "mp3": audioFormat = texttospeechpb.AudioEncoding_MP3 fileExtension = "mp3" case "opus": audioFormat = texttospeechpb.AudioEncoding_OGG_OPUS fileExtension = "ogg" case "ogg": audioFormat = texttospeechpb.AudioEncoding_OGG_OPUS fileExtension = "ogg" case "pcm": audioFormat = texttospeechpb.AudioEncoding_LINEAR16 fileExtension = "pcm" case "ulaw": audioFormat = texttospeechpb.AudioEncoding_MULAW fileExtension = "ulaw" case "alaw": audioFormat = texttospeechpb.AudioEncoding_ALAW fileExtension = "alaw" default: audioFormat = texttospeechpb.AudioEncoding_MP3 fileExtension = "mp3" } filename := "tts." + fileExtension if *opts.Output != "./tts.mp3" { filename = *opts.Output } /////////////////////////////////////// //Instantiates a Google Cloud client ctx := context.Background() client, err := texttospeech.NewClient(ctx) if err != nil { log.Fatal(err) } defer client.Close() if *opts.ListVoices { fmt.Println("Available Voices:") bufStdout := bufio.NewWriter(os.Stdout) listVoices(bufStdout, ctx, client, *opts.Language) bufStdout.Flush() os.Exit(0) } var inputFile *os.File if *opts.Input == "-" { //read input from stdin inputFile = os.Stdin } else { //read input from file var err error inputFile, err = os.Open(*opts.Input) if err != nil { log.Fatal(err) } defer inputFile.Close() } var input string scanner := bufio.NewScanner(inputFile) for scanner.Scan() { //fmt.Println(scanner.Text()) input = input + scanner.Text() } //Start building TTS request things synthInput := &texttospeechpb.SynthesisInput{} synthInput.InputSource = &texttospeechpb.SynthesisInput_Text{Text: input} if *opts.Ssml { synthInput.InputSource = &texttospeechpb.SynthesisInput_Ssml{Ssml: input} } //Voice Gender var gender texttospeechpb.SsmlVoiceGender switch *opts.Gender { case "m": gender = texttospeechpb.SsmlVoiceGender_MALE case "f": gender = texttospeechpb.SsmlVoiceGender_FEMALE default: gender = texttospeechpb.SsmlVoiceGender_NEUTRAL } voice := &texttospeechpb.VoiceSelectionParams{ LanguageCode: *opts.Language, SsmlGender: gender, //Name: *opts.Voice, //Name overrides LanguageCode and SsmlGender //Name: "en-US-Wavenet-B", } if *opts.Voice != "unspecified" { voice.Name = *opts.Voice } //the request parameters req := texttospeechpb.SynthesizeSpeechRequest{ Input: synthInput, Voice: voice, AudioConfig: &texttospeechpb.AudioConfig{ AudioEncoding: audioFormat, SpeakingRate: *opts.Speed, SampleRateHertz: int32(*opts.SampleRate), Pitch: *opts.Pitch, VolumeGainDb: *opts.VolumeGain, }, } resp, err := client.SynthesizeSpeech(ctx, &req) if err != nil { log.Fatal(err) } if *opts.Output == "-" { //write to stdout //binary.Write(os.Stdout, binary.LittleEndian, resp.AudioContent) bufStdout := bufio.NewWriter(os.Stdout) //add a buffer defer bufStdout.Flush() binary.Write(bufStdout, binary.LittleEndian, resp.AudioContent) } else { //write to file err = ioutil.WriteFile(filename, resp.AudioContent, 0644) if err != nil { log.Fatal(err) } fmt.Printf("Audio content written to file: %v\n", filename) } } func listVoices(w io.Writer, ctx context.Context, client *texttospeech.Client, lang string) error { resp, err := client.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{}) if err != nil { return err } for _, voice := range resp.Voices { for _, languageCode := range voice.LanguageCodes { if lang == languageCode || lang == "ALL" { fmt.Fprintln(w, "___________________________________") fmt.Fprintf(w, "Name: %v\n", voice.Name) fmt.Fprintf(w, " Language: %v\n", languageCode) fmt.Fprintf(w, " Gender: %v\n", voice.SsmlGender.String()) fmt.Fprintf(w, " Native Sample Rate (in Hz): %v\n", voice.NaturalSampleRateHertz) } } } fmt.Fprintln(w, "------------------------------------") return nil }