From a7bb029d23bd5cc3a1dd06bfdddb62c8ef82980a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 22 Jun 2023 17:53:10 +0200 Subject: [PATCH] feat: add tts with go-piper (#649) Signed-off-by: mudler --- Dockerfile | 33 ++++++++++++-- Makefile | 33 ++++++++++++-- api/api.go | 5 +++ api/localai.go | 78 +++++++++++++++++++++++++++++++++ api/options.go | 7 +++ entrypoint.sh | 2 +- go.mod | 3 +- main.go | 9 +++- pkg/gallery/models.go | 22 ++-------- pkg/model/initializers.go | 10 +++++ pkg/tts/generate.go | 12 +++++ pkg/tts/generate_unsupported.go | 10 +++++ pkg/tts/piper.go | 20 +++++++++ pkg/utils/path.go | 22 ++++++++++ 14 files changed, 237 insertions(+), 29 deletions(-) create mode 100644 api/localai.go create mode 100644 pkg/tts/generate.go create mode 100644 pkg/tts/generate_unsupported.go create mode 100644 pkg/tts/piper.go create mode 100644 pkg/utils/path.go diff --git a/Dockerfile b/Dockerfile index 854186b..bb07b53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,8 +5,13 @@ FROM golang:$GO_VERSION as requirements ARG BUILD_TYPE ARG CUDA_MAJOR_VERSION=11 ARG CUDA_MINOR_VERSION=7 +ARG SPDLOG_VERSION="1.11.0" +ARG PIPER_PHONEMIZE_VERSION='1.0.0' +ARG TARGETARCH +ARG TARGETVARIANT ENV BUILD_TYPE=${BUILD_TYPE} +ARG GO_TAGS="stablediffusion tts" RUN apt-get update && \ apt-get install -y ca-certificates cmake curl patch @@ -23,6 +28,8 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \ ; fi ENV PATH /usr/local/cuda/bin:${PATH} +WORKDIR /build + # OpenBLAS requirements RUN apt-get install -y libopenblas-dev @@ -30,19 +37,37 @@ RUN apt-get install -y libopenblas-dev RUN apt-get install -y libopencv-dev && \ ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2 +# piper requirements +# Use pre-compiled Piper phonemization library (includes onnxruntime) +#RUN if echo "${GO_TAGS}" | grep -q "tts"; then \ +RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \ + tar -xzvf - && \ + mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \ + cd "spdlog-${SPDLOG_VERSION}/build" && \ + cmake .. && \ + make -j8 && \ + cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \ + cd /build && \ + mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \ + curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH}${TARGETVARIANT}.tar.gz" | \ + tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \ + cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /lib64/ && \ + cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \ + cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ +# \ +# ; fi + FROM requirements as builder -ARG GO_TAGS=stablediffusion +ARG GO_TAGS="stablediffusion tts" ENV GO_TAGS=${GO_TAGS} ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0" ENV NVIDIA_VISIBLE_DEVICES=all -WORKDIR /build - COPY . . -RUN make build +RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build FROM requirements diff --git a/Makefile b/Makefile index 623df92..7a1379c 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp RWKV_VERSION?=f5a8c45396741470583f59b916a2a7641e63bcd0 WHISPER_CPP_VERSION?=57543c169e27312e7546d07ed0d8c6eb806ebc36 BERT_VERSION?=6069103f54b9969c02e789d0fb12a23bd614285f +PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7 BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f export BUILD_TYPE?= CGO_LDFLAGS?= @@ -18,8 +19,9 @@ CUDA_LIBPATH?=/usr/local/cuda/lib64/ STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632 GO_TAGS?= BUILD_ID?=git -LD_FLAGS=?= +LD_FLAGS?= OPTIONAL_TARGETS?= +ESPEAK_DATA?= OS := $(shell uname -s) ARCH := $(shell uname -m) @@ -30,7 +32,7 @@ CYAN := $(shell tput -Txterm setaf 6) RESET := $(shell tput -Txterm sgr0) C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-stable-diffusion/:$(shell pwd)/gpt4all/gpt4all-bindings/golang/:$(shell pwd)/go-ggml-transformers:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert:$(shell pwd)/bloomz -LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-stable-diffusion/:$(shell pwd)/gpt4all/gpt4all-bindings/golang/:$(shell pwd)/go-ggml-transformers:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert:$(shell pwd)/bloomz +LIBRARY_PATH=$(shell pwd)/go-piper:$(shell pwd)/go-llama:$(shell pwd)/go-stable-diffusion/:$(shell pwd)/gpt4all/gpt4all-bindings/golang/:$(shell pwd)/go-ggml-transformers:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert:$(shell pwd)/bloomz ifeq ($(BUILD_TYPE),openblas) CGO_LDFLAGS+=-lopenblas @@ -55,10 +57,15 @@ ifeq ($(STATIC),true) LD_FLAGS=-linkmode external -extldflags -static endif -ifeq ($(GO_TAGS),stablediffusion) +ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion) OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a endif +ifeq ($(findstring tts,$(GO_TAGS)),tts) + OPTIONAL_TARGETS+=go-piper/libpiper_binding.a + OPTIONAL_TARGETS+=backend-assets/espeak-ng-data +endif + .PHONY: all test build vendor all: help @@ -82,6 +89,10 @@ gpt4all: @find ./gpt4all/gpt4all-bindings/golang -type f -name "*.go" -exec sed -i'' -e 's/load_model/load_gpt4all_model/g' {} + @find ./gpt4all/gpt4all-bindings/golang -type f -name "*.h" -exec sed -i'' -e 's/load_model/load_gpt4all_model/g' {} + +## go-piper +go-piper: + git clone --recurse-submodules https://github.com/mudler/go-piper go-piper + cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1 ## BERT embeddings go-bert: @@ -133,6 +144,14 @@ backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a @cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true @cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true +backend-assets/espeak-ng-data: + mkdir -p backend-assets/espeak-ng-data +ifdef ESPEAK_DATA + @cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data +else + @touch backend-assets/espeak-ng-data/keep +endif + gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all $(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a @@ -172,6 +191,9 @@ go-llama: go-llama/libbinding.a: go-llama $(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a +go-piper/libpiper_binding.a: + $(MAKE) -C go-piper libpiper_binding.a example/main + replace: $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama $(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang @@ -181,8 +203,9 @@ replace: $(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert $(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz $(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion + $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper -prepare-sources: go-llama go-ggml-transformers gpt4all go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion replace +prepare-sources: go-llama go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion replace $(GOCMD) mod download ## GENERIC @@ -195,6 +218,7 @@ rebuild: ## Rebuilds the project $(MAKE) -C go-stable-diffusion clean $(MAKE) -C go-bert clean $(MAKE) -C bloomz clean + $(MAKE) -C go-piper clean $(MAKE) build prepare: prepare-sources backend-assets/gpt4all $(OPTIONAL_TARGETS) go-llama/libbinding.a go-bert/libgobert.a go-ggml-transformers/libtransformers.a go-rwkv/librwkv.a whisper.cpp/libwhisper.a bloomz/libbloomz.a ## Prepares for building @@ -210,6 +234,7 @@ clean: ## Remove build related file rm -rf ./go-bert rm -rf ./bloomz rm -rf ./whisper.cpp + rm -rf ./go-piper rm -rf $(BINARY_NAME) rm -rf release/ diff --git a/api/api.go b/api/api.go index 9d34392..6f2ac14 100644 --- a/api/api.go +++ b/api/api.go @@ -128,6 +128,7 @@ func App(opts ...AppOption) (*fiber.App, error) { // audio app.Post("/v1/audio/transcriptions", transcriptEndpoint(cm, options)) + app.Post("/tts", ttsEndpoint(cm, options)) // images app.Post("/v1/images/generations", imageEndpoint(cm, options)) @@ -136,6 +137,10 @@ func App(opts ...AppOption) (*fiber.App, error) { app.Static("/generated-images", options.imageDir) } + if options.audioDir != "" { + app.Static("/generated-audio", options.audioDir) + } + ok := func(c *fiber.Ctx) error { return c.SendStatus(200) } diff --git a/api/localai.go b/api/localai.go new file mode 100644 index 0000000..b719689 --- /dev/null +++ b/api/localai.go @@ -0,0 +1,78 @@ +package api + +import ( + "fmt" + "os" + "path/filepath" + + model "github.com/go-skynet/LocalAI/pkg/model" + "github.com/go-skynet/LocalAI/pkg/tts" + "github.com/go-skynet/LocalAI/pkg/utils" + llama "github.com/go-skynet/go-llama.cpp" + "github.com/gofiber/fiber/v2" +) + +type TTSRequest struct { + Model string `json:"model" yaml:"model"` + Input string `json:"input" yaml:"input"` +} + +func generateUniqueFileName(dir, baseName, ext string) string { + counter := 1 + fileName := baseName + ext + + for { + filePath := filepath.Join(dir, fileName) + _, err := os.Stat(filePath) + if os.IsNotExist(err) { + return fileName + } + + counter++ + fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext) + } +} + +func ttsEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error { + return func(c *fiber.Ctx) error { + + input := new(TTSRequest) + // Get input data from the request body + if err := c.BodyParser(input); err != nil { + return err + } + + piperModel, err := o.loader.BackendLoader(model.PiperBackend, input.Model, []llama.ModelOption{}, uint32(0), o.assetsDestination) + if err != nil { + return err + } + + if piperModel == nil { + return fmt.Errorf("could not load piper model") + } + + w, ok := piperModel.(*tts.Piper) + if !ok { + return fmt.Errorf("loader returned non-piper object %+v", w) + } + + if err := os.MkdirAll(o.audioDir, 0755); err != nil { + return err + } + + fileName := generateUniqueFileName(o.audioDir, "piper", ".wav") + filePath := filepath.Join(o.audioDir, fileName) + + modelPath := filepath.Join(o.loader.ModelPath, input.Model) + + if err := utils.VerifyPath(modelPath, o.loader.ModelPath); err != nil { + return err + } + + if err := w.TTS(input.Input, modelPath, filePath); err != nil { + return err + } + + return c.Download(filePath) + } +} diff --git a/api/options.go b/api/options.go index 2049f42..3d94eaa 100644 --- a/api/options.go +++ b/api/options.go @@ -15,6 +15,7 @@ type Option struct { f16 bool debug, disableMessage bool imageDir string + audioDir string cors bool preloadJSONModels string preloadModelsFromPath string @@ -130,6 +131,12 @@ func WithDisableMessage(disableMessage bool) AppOption { } } +func WithAudioDir(audioDir string) AppOption { + return func(o *Option) { + o.audioDir = audioDir + } +} + func WithImageDir(imageDir string) AppOption { return func(o *Option) { o.imageDir = imageDir diff --git a/entrypoint.sh b/entrypoint.sh index 28c4045..89feaf6 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -5,7 +5,7 @@ cd /build if [ "$REBUILD" != "false" ]; then rm -rf ./local-ai - make build + ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build fi ./local-ai "$@" \ No newline at end of file diff --git a/go.mod b/go.mod index 5ccb06d..f402902 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/go-skynet/LocalAI -go 1.19 +go 1.20 require ( github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674 @@ -52,6 +52,7 @@ require ( github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.19 // indirect github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/mudler/go-piper v0.0.0-00010101000000-000000000000 // indirect github.com/otiai10/mint v1.5.1 // indirect github.com/philhofer/fwd v1.1.2 // indirect github.com/rivo/uniseg v0.2.0 // indirect diff --git a/main.go b/main.go index 16d5e31..dc6968a 100644 --- a/main.go +++ b/main.go @@ -78,7 +78,13 @@ func main() { Name: "image-path", Usage: "Image directory", EnvVars: []string{"IMAGE_PATH"}, - Value: "", + Value: "/tmp/generated/images", + }, + &cli.StringFlag{ + Name: "audio-path", + Usage: "audio directory", + EnvVars: []string{"AUDIO_PATH"}, + Value: "/tmp/generated/audio", }, &cli.StringFlag{ Name: "backend-assets-path", @@ -125,6 +131,7 @@ It uses llama.cpp, ggml and gpt4all as backend with golang c bindings. api.WithContextSize(ctx.Int("context-size")), api.WithDebug(ctx.Bool("debug")), api.WithImageDir(ctx.String("image-path")), + api.WithAudioDir(ctx.String("audio-path")), api.WithF16(ctx.Bool("f16")), api.WithDisableMessage(false), api.WithCors(ctx.Bool("cors")), diff --git a/pkg/gallery/models.go b/pkg/gallery/models.go index 14a7d6a..8d4cd29 100644 --- a/pkg/gallery/models.go +++ b/pkg/gallery/models.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strconv" + "github.com/go-skynet/LocalAI/pkg/utils" "github.com/imdario/mergo" "github.com/rs/zerolog/log" "gopkg.in/yaml.v2" @@ -80,21 +81,6 @@ func ReadConfigFile(filePath string) (*Config, error) { return &config, nil } -func inTrustedRoot(path string, trustedRoot string) error { - for path != "/" { - path = filepath.Dir(path) - if path == trustedRoot { - return nil - } - } - return fmt.Errorf("path is outside of trusted root") -} - -func verifyPath(path, basePath string) error { - c := filepath.Clean(filepath.Join(basePath, path)) - return inTrustedRoot(c, basePath) -} - func Apply(basePath, nameOverride string, config *Config, configOverrides map[string]interface{}, downloadStatus func(string, string, string, float64)) error { // Create base path if it doesn't exist err := os.MkdirAll(basePath, 0755) @@ -110,7 +96,7 @@ func Apply(basePath, nameOverride string, config *Config, configOverrides map[st for _, file := range config.Files { log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename) - if err := verifyPath(file.Filename, basePath); err != nil { + if err := utils.VerifyPath(file.Filename, basePath); err != nil { return err } // Create file path @@ -196,7 +182,7 @@ func Apply(basePath, nameOverride string, config *Config, configOverrides map[st // Write prompt template contents to separate files for _, template := range config.PromptTemplates { - if err := verifyPath(template.Name+".tmpl", basePath); err != nil { + if err := utils.VerifyPath(template.Name+".tmpl", basePath); err != nil { return err } // Create file path @@ -221,7 +207,7 @@ func Apply(basePath, nameOverride string, config *Config, configOverrides map[st name = nameOverride } - if err := verifyPath(name+".yaml", basePath); err != nil { + if err := utils.VerifyPath(name+".yaml", basePath); err != nil { return err } diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 0091ee4..3849f85 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -9,6 +9,7 @@ import ( whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" "github.com/go-skynet/LocalAI/pkg/langchain" "github.com/go-skynet/LocalAI/pkg/stablediffusion" + "github.com/go-skynet/LocalAI/pkg/tts" bloomz "github.com/go-skynet/bloomz.cpp" bert "github.com/go-skynet/go-bert.cpp" transformers "github.com/go-skynet/go-ggml-transformers.cpp" @@ -39,6 +40,7 @@ const ( RwkvBackend = "rwkv" WhisperBackend = "whisper" StableDiffusionBackend = "stablediffusion" + PiperBackend = "piper" LCHuggingFaceBackend = "langchain-huggingface" ) @@ -103,6 +105,12 @@ var stableDiffusion = func(assetDir string) (interface{}, error) { return stablediffusion.New(assetDir) } +func piperTTS(assetDir string) func(s string) (interface{}, error) { + return func(s string) (interface{}, error) { + return tts.New(assetDir) + } +} + var whisperModel = func(modelFile string) (interface{}, error) { return whisper.New(modelFile) } @@ -158,6 +166,8 @@ func (ml *ModelLoader) BackendLoader(backendString string, modelFile string, lla return ml.LoadModel(modelFile, replit) case StableDiffusionBackend: return ml.LoadModel(modelFile, stableDiffusion) + case PiperBackend: + return ml.LoadModel(modelFile, piperTTS(filepath.Join(assetDir, "backend-assets", "espeak-ng-data"))) case StarcoderBackend: return ml.LoadModel(modelFile, starCoder) case Gpt4AllLlamaBackend, Gpt4AllMptBackend, Gpt4AllJBackend, Gpt4All: diff --git a/pkg/tts/generate.go b/pkg/tts/generate.go new file mode 100644 index 0000000..e4722d4 --- /dev/null +++ b/pkg/tts/generate.go @@ -0,0 +1,12 @@ +//go:build tts +// +build tts + +package tts + +import ( + piper "github.com/mudler/go-piper" +) + +func tts(text, model, assetDir, arLib, dst string) error { + return piper.TextToWav(text, model, assetDir, arLib, dst) +} diff --git a/pkg/tts/generate_unsupported.go b/pkg/tts/generate_unsupported.go new file mode 100644 index 0000000..3092695 --- /dev/null +++ b/pkg/tts/generate_unsupported.go @@ -0,0 +1,10 @@ +//go:build !tts +// +build !tts + +package tts + +import "fmt" + +func tts(text, model, assetDir, arLib, dst string) error { + return fmt.Errorf("this version of LocalAI was built without the tts tag") +} diff --git a/pkg/tts/piper.go b/pkg/tts/piper.go new file mode 100644 index 0000000..b76a637 --- /dev/null +++ b/pkg/tts/piper.go @@ -0,0 +1,20 @@ +package tts + +import "os" + +type Piper struct { + assetDir string +} + +func New(assetDir string) (*Piper, error) { + if _, err := os.Stat(assetDir); err != nil { + return nil, err + } + return &Piper{ + assetDir: assetDir, + }, nil +} + +func (s *Piper) TTS(text, model, dst string) error { + return tts(text, model, s.assetDir, "", dst) +} diff --git a/pkg/utils/path.go b/pkg/utils/path.go new file mode 100644 index 0000000..5808512 --- /dev/null +++ b/pkg/utils/path.go @@ -0,0 +1,22 @@ +package utils + +import ( + "fmt" + "path/filepath" +) + +func inTrustedRoot(path string, trustedRoot string) error { + for path != "/" { + path = filepath.Dir(path) + if path == trustedRoot { + return nil + } + } + return fmt.Errorf("path is outside of trusted root") +} + +// VerifyPath verifies that path is based in basePath. +func VerifyPath(path, basePath string) error { + c := filepath.Clean(filepath.Join(basePath, path)) + return inTrustedRoot(c, basePath) +}