From f8ee20991c832c8eeea0e7cd244b37b9d5692d07 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 10 May 2023 15:20:21 +0200
Subject: [PATCH] feat: add bert.cpp embeddings (#222)

---
 .github/workflows/bump_deps.yaml              |  3 ++
 Makefile                                      | 25 ++++++++--
 README.md                                     | 13 +++--
 api/api_test.go                               |  2 +-
 api/prediction.go                             |  9 ++++
 examples/query_data/README.md                 |  9 ++--
 examples/query_data/models/embeddings.yaml    | 16 +-----
 examples/query_data/models/gpt-3.5-turbo.yaml |  5 +-
 examples/query_data/models/wizardlm.tmpl      |  3 --
 examples/query_data/query.py                  |  2 +-
 examples/query_data/store.py                  |  6 +--
 go.mod                                        |  1 +
 go.sum                                        | 13 +----
 pkg/model/loader.go                           | 50 ++++++++++++++++++-
 14 files changed, 104 insertions(+), 53 deletions(-)
 delete mode 100644 examples/query_data/models/wizardlm.tmpl

diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index aaa5671..c889fab 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -24,6 +24,9 @@ jobs:
       - repository: "ggerganov/whisper.cpp"
         variable: "WHISPER_CPP_VERSION"
         branch: "master"
+      - repository: "go-skynet/go-bert.cpp"
+        variable: "BERT_VERSION"
+        branch: "master"
   runs-on: ubuntu-latest
   steps:
   - uses: actions/checkout@v3
diff --git a/Makefile b/Makefile
index bffb161..03ece68 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ GOGPT2_VERSION?=245a5bfe6708ab80dc5c733dcdbfbe3cfd2acdaa
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=af62fcc432be2847acb6e0688b2c2491d6588d58
 WHISPER_CPP_VERSION?=bf2449dfae35a46b2cd92ab22661ce81a48d4993
+BERT_VERSION?=ec771ec715576ac050263bb7bb74bfd616a5ba13
 
 
 GREEN  := $(shell tput -Txterm setaf 2)
@@ -17,8 +18,8 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)
 
-C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp
-LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp
+C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert
+LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert
 
 # Use this if you want to set the default behavior
 ifndef BUILD_TYPE
@@ -49,6 +50,14 @@ go-gpt4all-j:
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +
 
+## BERT embeddings
+go-bert:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
+	cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+	@find ./go-bert -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +
+	@find ./go-bert -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +
+	@find ./go-bert -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +
+
 ## RWKV
 go-rwkv:
 	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
@@ -60,6 +69,9 @@ go-rwkv:
 
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a .. && cp ggml/src/libggml.a ..
 
+go-bert/libgobert.a: go-bert
+	$(MAKE) -C go-bert libgobert.a
+
 go-gpt4all-j/libgptj.a: go-gpt4all-j
 	$(MAKE) -C go-gpt4all-j $(GENERIC_PREFIX)libgptj.a
 
@@ -98,8 +110,9 @@ replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
 
-prepare-sources: go-llama go-gpt2 go-gpt4all-j go-rwkv whisper.cpp
+prepare-sources: go-llama go-gpt2 go-gpt4all-j go-rwkv whisper.cpp go-bert
 	$(GOCMD) mod download
 
 ## GENERIC
@@ -109,15 +122,17 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C go-gpt2 clean
 	$(MAKE) -C go-rwkv clean
 	$(MAKE) -C whisper.cpp clean
+	$(MAKE) -C go-bert clean
 	$(MAKE) build
 
-prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a go-rwkv/librwkv.a whisper.cpp/libwhisper.a replace ## Prepares for building
+prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-bert/libgobert.a go-gpt2/libgpt2.a go-rwkv/librwkv.a whisper.cpp/libwhisper.a replace ## Prepares for building
 
 clean: ## Remove build related file
 	rm -fr ./go-llama
 	rm -rf ./go-gpt4all-j
 	rm -rf ./go-gpt2
 	rm -rf ./go-rwkv
+	rm -rf ./go-bert
 	rm -rf $(BINARY_NAME)
 
 ## Build:
@@ -141,7 +156,7 @@ test-models/testmodel:
 
 test: prepare test-models/testmodel
 	cp tests/fixtures/* test-models
-	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo -v -r ./...
+	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo -v -r ./api
 
 ## Help:
 help: ## Show this help.
diff --git a/README.md b/README.md
index 87eb2f1..c77d789 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 
 [![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy)
 
-**LocalAI** is a drop-in replacement REST API compatible with OpenAI for local CPU inferencing. It allows to run models locally or on-prem with consumer grade hardware. It is based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all), [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp) and [ggml](https://github.com/ggerganov/ggml), including support GPT4ALL-J which is licensed under Apache 2.0.
+**LocalAI** is a drop-in replacement REST API compatible with OpenAI for local CPU inferencing. It allows you to run models locally or on-prem with consumer-grade hardware, and supports multiple model families. It also supports GPT4ALL-J, which is licensed under Apache 2.0.
 
 - OpenAI compatible API
 - Supports multiple models
@@ -19,10 +19,14 @@
 
 LocalAI is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).
 
+LocalAI uses C++ bindings to optimize speed. It is based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all), [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp), [ggml](https://github.com/ggerganov/ggml), [whisper.cpp](https://github.com/ggerganov/whisper.cpp) for audio transcriptions, and [bert.cpp](https://github.com/skeskinen/bert.cpp) for embeddings.
+
 See [examples on how to integrate LocalAI](https://github.com/go-skynet/LocalAI/tree/master/examples/).
 
 ## News
 
+- 10-05-2023: Added support for fast and accurate embeddings with `bert.cpp` ( https://github.com/go-skynet/LocalAI/pull/222 )
+- 09-05-2023: Added experimental support for transcriptions endpoint ( https://github.com/go-skynet/LocalAI/pull/211 )
 - 08-05-2023: Support for embeddings with models using the `llama.cpp` backend ( https://github.com/go-skynet/LocalAI/pull/207 )
 - 02-05-2023: Support for `rwkv.cpp` models ( https://github.com/go-skynet/LocalAI/pull/158 ) and for `/edits` endpoint
 - 01-05-2023: Support for SSE stream of tokens in `llama.cpp` backends ( https://github.com/go-skynet/LocalAI/pull/152 )
@@ -534,18 +538,18 @@ curl http://localhost:8080/v1/models
-The embedding endpoint is experimental and enabled only if the model is configured with `emebddings: true` in its `yaml` file, for example:
+The embedding endpoint is experimental and enabled only if the model is configured with `embeddings: true` in its `yaml` file, for example:
 
 ```yaml
 name: text-embedding-ada-002
 parameters:
-  model: wizardLM-7B.ggml.q5_1.bin
+  model: bert
 embeddings: true
 ```
 
 There is an example available [here](https://github.com/go-skynet/LocalAI/tree/master/examples/query_data/).
 
-Note: embeddings is supported only with `llama.cpp` compatible models. (doesn't work with gpt4-all-j, yet).
+Note: embeddings are supported only with `llama.cpp`-compatible models and `bert` models. `bert` is more performant and available independently of the LLM model.
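[Editor's note: a quick way to smoke-test the configuration above is to POST to the OpenAI-compatible `/v1/embeddings` endpoint. The following is a minimal sketch, assuming LocalAI is reachable on `localhost:8080` and the `text-embedding-ada-002` model from the example `yaml` is in the models path; the request and response structs are trimmed to only the fields used here, not the full OpenAI schema.]

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Minimal request/response shapes for the OpenAI-compatible
// /v1/embeddings endpoint, reduced to the fields used below.
type embeddingRequest struct {
	Model string `json:"model"`
	Input string `json:"input"`
}

type embeddingResponse struct {
	Data []struct {
		Embedding []float32 `json:"embedding"`
	} `json:"data"`
}

func main() {
	payload, err := json.Marshal(embeddingRequest{
		Model: "text-embedding-ada-002",
		Input: "LocalAI computes embeddings on the CPU",
	})
	if err != nil {
		panic(err)
	}

	// POST the request to the local LocalAI instance.
	resp, err := http.Post("http://localhost:8080/v1/embeddings", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out embeddingResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Printf("got a %d-dimensional embedding\n", len(out.Data[0].Embedding))
}
```

[The same request works from any OpenAI client library by pointing its base URL at the LocalAI instance.]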
@@ -667,6 +671,7 @@ MIT
 - [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - [go-skynet/go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp)
 - [go-skynet/go-gpt2.cpp](https://github.com/go-skynet/go-gpt2.cpp)
+- [go-skynet/go-bert.cpp](https://github.com/go-skynet/go-bert.cpp)
 - [donomii/go-rwkv.cpp](https://github.com/donomii/go-rwkv.cpp)
 
 ## Acknowledgements
diff --git a/api/api_test.go b/api/api_test.go
index 9682a21..0a58191 100644
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -79,7 +79,7 @@ var _ = Describe("API test", func() {
 			It("returns errors", func() {
 				_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
 				Expect(err).To(HaveOccurred())
-				Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 5 errors occurred:"))
+				Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 6 errors occurred:"))
 			})
 		})
diff --git a/api/prediction.go b/api/prediction.go
index 95d111f..f6c6192 100644
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -8,6 +8,7 @@ import (
 
 	"github.com/donomii/go-rwkv.cpp"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	bert "github.com/go-skynet/go-bert.cpp"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
@@ -62,6 +63,14 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config)
 			}
 			return model.Embeddings(s, predictOptions...)
 		}
+	// bert embeddings
+	case *bert.Bert:
+		fn = func() ([]float32, error) {
+			if len(tokens) > 0 {
+				return nil, fmt.Errorf("embeddings endpoint for this model supports only string")
+			}
+			return model.Embeddings(s, bert.SetThreads(c.Threads))
+		}
 	default:
 		fn = func() ([]float32, error) {
 			return nil, fmt.Errorf("embeddings not supported by the backend")
diff --git a/examples/query_data/README.md b/examples/query_data/README.md
index 9185709..f7a4e1f 100644
--- a/examples/query_data/README.md
+++ b/examples/query_data/README.md
@@ -12,11 +12,7 @@ Summary of the steps:
 
 ## Requirements
 
-For this in order to work, you will need LocalAI and a model compatible with the `llama.cpp` backend. This is will not work with gpt4all, however you can mix models (use a llama.cpp one to build the index database, and gpt4all to query it).
-
-The example uses `WizardLM` for both embeddings and Q&A. Edit the config files in `models/` accordingly to specify the model you use (change `HERE` in the configuration files).
-
-You will also need a training data set. Copy that over `data`.
+You will need a training data set. Copy it into the `data` directory.
 
 ## Setup
 
@@ -28,7 +24,8 @@ git clone https://github.com/go-skynet/LocalAI
 
 cd LocalAI/examples/query_data
 
-# Copy your models, edit config files accordingly
+wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O models/bert
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
 
 # start with docker-compose
 docker-compose up -d --build
diff --git a/examples/query_data/models/embeddings.yaml b/examples/query_data/models/embeddings.yaml
index 2173975..b90ca75 100644
--- a/examples/query_data/models/embeddings.yaml
+++ b/examples/query_data/models/embeddings.yaml
@@ -1,18 +1,6 @@
 name: text-embedding-ada-002
 parameters:
-  model: HERE
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
+  model: bert
 threads: 14
-stopwords:
-- "HUMAN:"
-- "GPT:"
-roles:
-  user: " "
-  system: " "
+backend: bert-embeddings
 embeddings: true
-template:
-  completion: completion
-  chat: gpt4all
diff --git a/examples/query_data/models/gpt-3.5-turbo.yaml b/examples/query_data/models/gpt-3.5-turbo.yaml
index 9cdb4a2..6df1dbf 100644
--- a/examples/query_data/models/gpt-3.5-turbo.yaml
+++ b/examples/query_data/models/gpt-3.5-turbo.yaml
@@ -1,12 +1,11 @@
 name: gpt-3.5-turbo
 parameters:
-  model: HERE
+  model: ggml-gpt4all-j
   top_k: 80
   temperature: 0.2
   top_p: 0.7
 context_size: 1024
 threads: 14
-embeddings: true
 stopwords:
 - "HUMAN:"
 - "GPT:"
@@ -15,4 +14,4 @@ roles:
   system: " "
 template:
   completion: completion
-  chat: wizardlm
+  chat: gpt4all
\ No newline at end of file
diff --git a/examples/query_data/models/wizardlm.tmpl b/examples/query_data/models/wizardlm.tmpl
deleted file mode 100644
index e7b1985..0000000
--- a/examples/query_data/models/wizardlm.tmpl
+++ /dev/null
@@ -1,3 +0,0 @@
-{{.Input}}
-
-### Response:
\ No newline at end of file
diff --git a/examples/query_data/query.py b/examples/query_data/query.py
index 0f1408e..e3dcde2 100644
--- a/examples/query_data/query.py
+++ b/examples/query_data/query.py
@@ -13,7 +13,7 @@ base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')
 llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_base=base_path))
 
 # Configure prompt parameters and initialise helper
-max_input_size = 1024
+max_input_size = 500
 num_output = 256
 max_chunk_overlap = 20
 
diff --git a/examples/query_data/store.py b/examples/query_data/store.py
index e029694..0d628c8 100644
--- a/examples/query_data/store.py
+++ b/examples/query_data/store.py
@@ -13,15 +13,15 @@ base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')
 llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_base=base_path))
 
 # Configure prompt parameters and initialise helper
-max_input_size = 512
-num_output = 512
+max_input_size = 400
+num_output = 400
 max_chunk_overlap = 30
 
 prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
 
 # Load documents from the 'data' directory
 documents = SimpleDirectoryReader('data').load_data()
-service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit = 512)
+service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, chunk_size_limit = 400)
 index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
 
 index.storage_context.persist(persist_dir="./storage")
diff --git a/go.mod b/go.mod
index c25c9e3..59c9297 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,7 @@ require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be
 	github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230508180809-bf2449dfae35
 	github.com/go-audio/wav v1.1.0
+	github.com/go-skynet/go-bert.cpp v0.0.0-20230510101404-7bb183b147ea
 	github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708
 	github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c
 	github.com/go-skynet/go-llama.cpp v0.0.0-20230509080828-f4d26f43f1d3
diff --git a/go.sum b/go.sum
index 400d9ff..538fbeb 100644
--- a/go.sum
+++ b/go.sum
@@ -16,8 +16,6 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be h1:3Hic97PY6hcw/SY44RuR7kyONkxd744RFeRrqckzwNQ=
-github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
 github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230508180809-bf2449dfae35 h1:sMg/SgnMPS/HNUO/2kGm72vl8R9TmNIwgLFr2TNwR3g=
 github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230508180809-bf2449dfae35/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo=
 github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@@ -38,14 +36,6 @@ github.com/go-openapi/spec v0.20.4/go.mod h1:faYFR1CvsJZ0mNsmsphTMSoRrNV3TEDoAM7
 github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk=
 github.com/go-openapi/swag v0.19.15 h1:D2NRCBzS9/pEY3gP9Nl8aDqGUcPFrwG2p+CNFrLyrCM=
 github.com/go-openapi/swag v0.19.15/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ=
-github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708 h1:cfOi4TWvQ6JsAm9Q1A8I8j9YfNy10bmIfwOiyGyU5wQ=
-github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
-github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c h1:48I7jpLNGiQeBmF0SFVVbREh8vlG0zN13v9LH5ctXis=
-github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230508165257-c03e8adbc45c h1:JoW2+LKrSemoV32QRwrEC5f53erym96NCsUSM3wSVbM=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230508165257-c03e8adbc45c/go.mod h1:DLfsPD7tYYnpksERH83HSf7qVNW3FIwmz7/zfYO0/6I=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230509080828-f4d26f43f1d3 h1:YNi1oetK5kGJoUgT3/r/Wj3XPOICWf3nwHsz5v89iSs=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230509080828-f4d26f43f1d3/go.mod h1:DLfsPD7tYYnpksERH83HSf7qVNW3FIwmz7/zfYO0/6I=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
@@ -197,8 +187,9 @@ golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8T
 google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
 gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b h1:QRR6H1YWRnHb4Y/HeNFCTJLFVxaq6wH4YuVdsUOr75U=
+gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
 gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
diff --git a/pkg/model/loader.go b/pkg/model/loader.go
index 167d5d7..8df4ec9 100644
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -14,6 +14,7 @@ import (
 	"github.com/rs/zerolog/log"
 
 	rwkv "github.com/donomii/go-rwkv.cpp"
+	bert "github.com/go-skynet/go-bert.cpp"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
@@ -22,13 +23,15 @@ import (
 type ModelLoader struct {
 	ModelPath string
 	mu        sync.Mutex
-
+	// TODO: this needs generics
 	models            map[string]*llama.LLama
 	gptmodels         map[string]*gptj.GPTJ
 	gpt2models        map[string]*gpt2.GPT2
 	gptstablelmmodels map[string]*gpt2.StableLM
 	rwkv              map[string]*rwkv.RwkvState
-	promptsTemplates  map[string]*template.Template
+	bert              map[string]*bert.Bert
+
+	promptsTemplates map[string]*template.Template
 }
 
 func NewModelLoader(modelPath string) *ModelLoader {
@@ -39,6 +42,7 @@ func NewModelLoader(modelPath string) *ModelLoader {
 		gptstablelmmodels: make(map[string]*gpt2.StableLM),
 		models:            make(map[string]*llama.LLama),
 		rwkv:              make(map[string]*rwkv.RwkvState),
+		bert:              make(map[string]*bert.Bert),
 		promptsTemplates:  make(map[string]*template.Template),
 	}
 }
@@ -156,6 +160,38 @@ func (ml *ModelLoader) LoadStableLMModel(modelName string) (*gpt2.StableLM, erro
 	return model, err
 }
 
+func (ml *ModelLoader) LoadBERT(modelName string) (*bert.Bert, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	// Check that the model file exists in the model path
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.bert[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model, err := bert.New(modelFile)
+	if err != nil {
+		return nil, err
+	}
+
+	// If there is a prompt template, load it
+	if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+		return nil, err
+	}
+
+	ml.bert[modelName] = model
+	return model, err
+}
+
 func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
@@ -299,6 +335,8 @@ func (ml *ModelLoader) BackendLoader(backendString string, modelFile string, lla
 		return ml.LoadGPT2Model(modelFile)
 	case "gptj":
 		return ml.LoadGPTJModel(modelFile)
+	case "bert-embeddings":
+		return ml.LoadBERT(modelFile)
 	case "rwkv":
 		return ml.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads)
 	default:
@@ -361,5 +399,13 @@ func (ml *ModelLoader) GreedyLoader(modelFile string, llamaOpts []llama.ModelOpt
 		err = multierror.Append(err, modelerr)
 	}
 
+	model, modelerr = ml.LoadBERT(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
 	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
 }